JSONL/NDJSON con Python: leer, escribir y procesar línea a línea

JSONL (JSON Lines), también conocido como NDJSON (Newline Delimited JSON), es un formato de texto donde cada línea es un objeto JSON válido e independiente. Es el estándar de facto para datos de entrenamiento de LLMs, logs de aplicación y pipelines de datos a gran escala.

## JSONL vs JSON vs CSV: cuándo usar cada uno

| Característica | JSON | JSONL | CSV |
|---|---|---|---|
| Estructura | Un objeto/array único | Una línea = un registro | Tabular (filas/columnas) |
| Streaming | No | **Sí** | Parcial |
| Append sin re-parsear | No | **Sí** | Sí |
| Datos anidados | Sí | **Sí** | No |
| Legibilidad humana | Alta | Media | Alta |
| Compresión | Buena | **Muy buena** | Buena |
| Uso en ML/LLMs | No | **Estándar** | No |

## Estructura de un archivo JSONL

```jsonl
{"id": 1, "nombre": "Ana", "tags": ["python", "data"]}
{"id": 2, "nombre": "Luis", "tags": ["web", "devops"]}
{"id": 3, "nombre": "María", "puntuacion": 9.5, "activo": true}
```

Cada línea es un JSON independiente — líneas vacías se ignoran por convención.

## Leer JSONL en Python

```python
import json
from pathlib import Path


# Read line by line (streaming — constant memory)
def leer_jsonl(ruta):
    """Yield one parsed JSON value per non-blank line of the file at *ruta*.

    The file is read lazily, so only the current line is held in memory.
    Raises json.JSONDecodeError on the first line that is not valid JSON.
    """
    with open(ruta, 'r', encoding='utf-8') as f:
        for linea in f:
            linea = linea.strip()
            if linea:  # skip blank lines
                yield json.loads(linea)


# Use it as a generator (efficient with large files)
for registro in leer_jsonl('datos.jsonl'):
    print(registro['nombre'])

# Load everything into a list (only if it fits in memory)
registros = list(leer_jsonl('datos.jsonl'))
print(f"Total registros: {len(registros)}")


# Count records without loading the file into memory
def contar_lineas_jsonl(ruta):
    """Return the number of non-blank lines in the file at *ruta*."""
    with open(ruta, 'r', encoding='utf-8') as f:
        return sum(1 for linea in f if linea.strip())


total = contar_lineas_jsonl('datos.jsonl')
print(f"Registros: {total}")
```

## Escribir JSONL en Python

```python
import json

datos = [
    {'id': 1, 'texto': 'Primer registro', 'valor': 42},
    {'id': 2, 'texto': 'Segundo registro', 'valor': 87},
    {'id': 3, 'texto': 'Tercer registro', 'valor': 15},
]

# Write from a list: one JSON document per line
with open('salida.jsonl', 'w', encoding='utf-8') as f:
    for registro in datos:
        f.write(json.dumps(registro, ensure_ascii=False) + '\n')

# Append records to an existing JSONL file
registro_nuevo = {'id': 4, 'texto': 'Registro nuevo', 'valor': 99}
with open('salida.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(registro_nuevo, ensure_ascii=False) + '\n')

print("JSONL escrito correctamente")
```

## Procesar JSONL con pandas

```python
import pandas as pd

# Read JSONL with pandas (lines=True is the key argument)
df = pd.read_json('datos.jsonl', lines=True)
print(df.head())
print(df.dtypes)

# Filter and process
df_activos = df[df['activo'] == True]
print(f"Registros activos: {len(df_activos)}")

# Save the result as JSONL
df_activos.to_json('activos.jsonl', orient='records', lines=True, force_ascii=False)

# Read gzip-compressed JSONL directly
df_gz = pd.read_json('datos.jsonl.gz', lines=True, compression='gzip')
```

## Procesamiento streaming de archivos grandes

Para archivos JSONL de varios GB, el procesamiento en streaming evita cargar todo en memoria:

```python
import json


def estadisticas_jsonl(ruta, campo_numerico, campo_grupo=None):
    """Print count, mean, min and max of a numeric field, optionally per group.

    Only running aggregates are kept for each group, so memory use depends
    on the number of distinct groups, not on the number of records.

    ruta: path of the JSONL file to read.
    campo_numerico: key whose value is aggregated; records where it is
        missing or null are skipped.
    campo_grupo: optional key used to group records; records without that
        key fall into the 'global' group.

    Lines that are not valid JSON, are not JSON objects, or whose value
    cannot be converted to float are reported and skipped.
    """
    grupos = {}  # group key -> [count, sum, min, max]
    total = 0
    with open(ruta, 'r', encoding='utf-8') as f:
        for num_linea, linea in enumerate(f, 1):
            linea = linea.strip()
            if not linea:
                continue
            try:
                registro = json.loads(linea)
            except json.JSONDecodeError as e:
                print(f"ERROR línea {num_linea}: {e}")
                continue
            if not isinstance(registro, dict):
                print(f"ERROR línea {num_linea}: se esperaba un objeto JSON")
                continue
            valor = registro.get(campo_numerico)
            if valor is None:
                continue
            try:
                valor = float(valor)
            except (TypeError, ValueError) as e:
                print(f"ERROR línea {num_linea}: {e}")
                continue
            clave = registro.get(campo_grupo, 'global') if campo_grupo else 'global'
            acumulado = grupos.get(clave)
            if acumulado is None:
                grupos[clave] = [1, valor, valor, valor]
            else:
                acumulado[0] += 1
                acumulado[1] += valor
                if valor < acumulado[2]:
                    acumulado[2] = valor
                if valor > acumulado[3]:
                    acumulado[3] = valor
            total += 1
    for grupo, (n, suma, minimo, maximo) in grupos.items():
        media = suma / n
        print(f"{grupo}: n={n}, media={media:.2f}, min={minimo:.2f}, max={maximo:.2f}")
    print(f"\nTotal registros válidos: {total}")


estadisticas_jsonl('ventas.jsonl', 'importe', 'region')
```

## Transformar y filtrar JSONL en pipeline

```python
import json


def pipeline_jsonl(entrada, salida, filtro_fn, transformar_fn=None):
    """Pipeline: read JSONL -> filter -> transform -> write JSONL.

    entrada: path of the JSONL file to read.
    salida: path of the JSONL file to write (overwritten).
    filtro_fn: called with each record; the record is kept when it returns
        a truthy value.
    transformar_fn: optional; called with each kept record, and its return
        value is what gets written.

    Prints how many records were read and how many were written.
    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    procesados = 0
    escritos = 0
    with open(entrada, 'r', encoding='utf-8') as fin, \
         open(salida, 'w', encoding='utf-8') as fout:
        for linea in fin:
            linea = linea.strip()
            if not linea:
                continue
            procesados += 1
            registro = json.loads(linea)
            if not filtro_fn(registro):
                continue
            if transformar_fn:
                registro = transformar_fn(registro)
            fout.write(json.dumps(registro, ensure_ascii=False) + '\n')
            escritos += 1
    print(f"Procesados: {procesados} | Escritos: {escritos}")


# Example: keep sales above 1000 and add a computed field
pipeline_jsonl(
    'ventas.jsonl',
    'ventas_grandes.jsonl',
    filtro_fn=lambda r: r.get('importe', 0) > 1000,
    transformar_fn=lambda r: {**r, 'categoria': 'grande'},
)
```

## Convertir JSON ↔ JSONL

```python
import gzip
import json
import shutil

# JSON array -> JSONL
# encoding='utf-8' is explicit because ensure_ascii=False emits non-ASCII
# characters, which the platform default encoding may not be able to write.
with open('lista.json', 'r', encoding='utf-8') as fin, \
     open('lista.jsonl', 'w', encoding='utf-8') as fout:
    datos = json.load(fin)
    for item in datos:
        fout.write(json.dumps(item, ensure_ascii=False) + '\n')

# JSONL -> JSON array
with open('lista.jsonl', 'r', encoding='utf-8') as fin, \
     open('lista.json', 'w', encoding='utf-8') as fout:
    registros = [json.loads(linea) for linea in fin if linea.strip()]
    json.dump(registros, fout, ensure_ascii=False, indent=2)

# Compress JSONL to gzip (reduces size by 70-80%)
# copyfileobj copies in chunks, so the file is never fully loaded in memory.
with open('datos.jsonl', 'rb') as fin, gzip.open('datos.jsonl.gz', 'wb') as fout:
    shutil.copyfileobj(fin, fout)
```

## Validar JSONL

```python
import json


def validar_jsonl(ruta, schema_keys=None):
    """Check that every non-blank line is valid JSON with the required keys.

    ruta: path of the JSONL file to validate.
    schema_keys: optional iterable of keys every record must contain; when
        given, each line must also be a JSON object.

    Prints the number of lines checked, the number of errors and the first
    10 error messages. Returns True when no errors were found.
    """
    errores = []
    total = 0
    with open(ruta, 'r', encoding='utf-8') as f:
        for num, linea in enumerate(f, 1):
            linea = linea.strip()
            if not linea:
                continue
            total += 1
            try:
                registro = json.loads(linea)
            except json.JSONDecodeError as e:
                errores.append(f"Línea {num}: JSON inválido — {e}")
                continue
            if not schema_keys:
                continue
            if not isinstance(registro, dict):
                # Key lookup on a scalar would raise TypeError; report it instead.
                errores.append(f"Línea {num}: se esperaba un objeto JSON")
                continue
            faltantes = [k for k in schema_keys if k not in registro]
            if faltantes:
                errores.append(f"Línea {num}: faltan claves {faltantes}")
    print(f"Total: {total} | Errores: {len(errores)}")
    for err in errores[:10]:  # show only the first 10 errors
        print(f"  {err}")
    return len(errores) == 0


validar_jsonl('datos.jsonl', schema_keys=['id', 'texto'])
```

## Casos de uso principales de JSONL

- **Datos de entrenamiento de LLMs**: OpenAI fine-tuning, Hugging Face datasets usan JSONL
- **Logs de aplicación**: cada evento = una línea, fácil de procesar con grep/jq
- **Export de bases de datos**: MongoDB, Elasticsearch exportan en JSONL por defecto
- **Pipelines ETL**: procesamiento incremental de millones de registros sin cargar en RAM
- **APIs de streaming**: respuestas chunk a chunk en APIs de IA (ChatGPT, Claude)

Formato JSONL/NDJSON: procesamiento eficiente de datos en Python