JSONL (JSON Lines), also known as NDJSON (Newline Delimited JSON), is a text format where each line is a valid, self-contained JSON object. It is the de facto standard for LLM training data, application logs, and large-scale data pipelines.
## JSONL vs JSON vs CSV: when to use each
| Feature | JSON | JSONL | CSV |
|---|---|---|---|
| Structure | Single object/array | One line = one record | Tabular (rows/columns) |
| Streaming | No | **Yes** | Partial |
| Append without re-parsing | No | **Yes** | Yes |
| Nested data | Yes | **Yes** | No |
| Human-readable | High | Medium | High |
| Compression | Good | **Very good** | Good |
| ML/LLM use | No | **Standard** | No |
## JSONL file structure
```jsonl
{"id": 1, "name": "Alice", "tags": ["python", "data"]}
{"id": 2, "name": "Bob", "tags": ["web", "devops"]}
{"id": 3, "name": "Carol", "score": 9.5, "active": true}
```
Each line is an independent JSON object — empty lines are ignored by convention.
## Reading JSONL in Python
```python
import json
from pathlib import Path
# Line-by-line generator (streaming — constant memory)
def read_jsonl(path):
    """Yield one parsed JSON object per non-empty line of *path*.

    Blank lines are skipped, so trailing newlines don't produce records.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
# Use as generator (efficient for large files)
for record in read_jsonl('data.jsonl'):
    print(record['name'])

# Load all into list (only if it fits in memory)
records = list(read_jsonl('data.jsonl'))
print(f"Total records: {len(records)}")
# Count lines without loading into memory
def count_jsonl(path):
    """Return the number of records (non-empty lines) in *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        # Generator expression streams the file; blank lines don't count
        return sum(1 for line in f if line.strip())
# Report the record count for the sample file
total = count_jsonl('data.jsonl')
print(f"Records: {total}")
```
## Writing JSONL in Python
```python
import json
data = [
    {'id': 1, 'text': 'First record', 'value': 42},
    {'id': 2, 'text': 'Second record', 'value': 87},
    {'id': 3, 'text': 'Third record', 'value': 15},
]

# Write from list: one JSON document per line, ending with '\n'
with open('output.jsonl', 'w', encoding='utf-8') as f:
    for record in data:
        # ensure_ascii=False keeps non-ASCII text readable in the file
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Append to existing JSONL — no re-parsing of earlier records needed
new_record = {'id': 4, 'text': 'New record', 'value': 99}
with open('output.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(new_record, ensure_ascii=False) + '\n')
print("JSONL written successfully")
```
## Processing JSONL with pandas
```python
import pandas as pd
# Read JSONL with pandas (lines=True is key)
df = pd.read_json('data.jsonl', lines=True)
print(df.head())
print(df.dtypes)

# Filter and process
df_active = df[df['active'].eq(True)]
print(f"Active records: {len(df_active)}")

# Save results as JSONL
df_active.to_json('active.jsonl', orient='records', lines=True, force_ascii=False)

# Read compressed JSONL (gzip) directly
df_gz = pd.read_json('data.jsonl.gz', lines=True, compression='gzip')
## Streaming large files
For multi-GB JSONL files, streaming avoids loading everything into memory:
```python
import json
from collections import defaultdict
def jsonl_stats(path, numeric_field, group_field=None):
    """Calculate stats from JSONL without loading into memory.

    Streams the file and keeps only running aggregates per group
    (count, sum, min, max), so memory use is O(number of groups)
    rather than O(number of records).

    Args:
        path: JSONL file to read.
        numeric_field: key whose numeric value is aggregated.
        group_field: optional key to group by; records missing it
            (or when it is None) fall into the 'global' group.

    Returns:
        dict mapping group -> (n, mean, min, max). Also prints one
        summary line per group and the total of valid records.
    """
    # group -> [count, sum, min, max] running aggregates
    groups = {}
    total = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # Malformed lines are reported but do not stop the stream
                print(f"ERROR line {line_num}: {e}")
                continue
            value = record.get(numeric_field)
            if value is not None:
                key = record.get(group_field, 'global') if group_field else 'global'
                v = float(value)
                agg = groups.get(key)
                if agg is None:
                    groups[key] = [1, v, v, v]
                else:
                    agg[0] += 1
                    agg[1] += v
                    agg[2] = min(agg[2], v)
                    agg[3] = max(agg[3], v)
                total += 1
    stats = {}
    for group, (n, s, lo, hi) in groups.items():
        mean = s / n
        stats[group] = (n, mean, lo, hi)
        print(f"{group}: n={n}, mean={mean:.2f}, min={lo:.2f}, max={hi:.2f}")
    print(f"\nValid records: {total}")
    return stats
# Example: per-region stats over the 'amount' field of sales.jsonl
jsonl_stats('sales.jsonl', 'amount', 'region')
```
## Transform and filter JSONL pipeline
```python
import json
def jsonl_pipeline(input_path, output_path, filter_fn, transform_fn=None):
    """Pipeline: read JSONL -> filter -> transform -> write JSONL.

    Args:
        input_path: source JSONL file.
        output_path: destination JSONL file (overwritten).
        filter_fn: callable(record) -> bool; records failing it are dropped.
        transform_fn: optional callable(record) -> record, applied to
            each kept record before writing.
    """
    processed = 0
    written = 0
    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            processed += 1
            record = json.loads(line)
            if not filter_fn(record):
                continue
            if transform_fn:
                record = transform_fn(record)
            fout.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print(f"Processed: {processed} | Written: {written}")
# Example: filter sales > 1000 and add computed field
jsonl_pipeline(
    'sales.jsonl',
    'large_sales.jsonl',
    filter_fn=lambda rec: rec.get('amount', 0) > 1000,
    transform_fn=lambda rec: {**rec, 'category': 'large'},
)
```
## Convert JSON ↔ JSONL
```python
import json, gzip
# JSON array -> JSONL: load the array once, emit one line per element
with open('list.json', 'r', encoding='utf-8') as fin, open('list.jsonl', 'w', encoding='utf-8') as fout:
    data = json.load(fin)
    for item in data:
        fout.write(json.dumps(item, ensure_ascii=False) + '\n')

# JSONL -> JSON array: collect non-empty lines into one pretty-printed array
with open('list.jsonl', 'r', encoding='utf-8') as fin, open('list.json', 'w', encoding='utf-8') as fout:
    records = [json.loads(line) for line in fin if line.strip()]
    json.dump(records, fout, ensure_ascii=False, indent=2)

# Compress JSONL to gzip (reduces size by 70-80%)
with open('data.jsonl', 'rb') as fin, gzip.open('data.jsonl.gz', 'wb') as fout:
    fout.write(fin.read())
```
## Validate JSONL
```python
import json
def validate_jsonl(path, required_keys=None):
    """Validate each line is valid JSON and optionally has required keys.

    Args:
        path: JSONL file to check.
        required_keys: optional iterable of keys every record must contain.

    Returns:
        True when no errors were found. Prints a summary plus the
        first 10 error messages.
    """
    errors = []
    total = 0
    with open(path, 'r', encoding='utf-8') as f:
        for num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                record = json.loads(line)
                if required_keys:
                    missing = [k for k in required_keys if k not in record]
                    if missing:
                        errors.append(f"Line {num}: missing keys {missing}")
            except json.JSONDecodeError as e:
                errors.append(f"Line {num}: invalid JSON — {e}")
    print(f"Total: {total} | Errors: {len(errors)}")
    # Cap the report at 10 errors so huge broken files stay readable
    for err in errors[:10]:
        print(f"  {err}")
    return len(errors) == 0
# Example: require every record to carry 'id' and 'text'
validate_jsonl('data.jsonl', required_keys=['id', 'text'])
```
## Main JSONL use cases
- **LLM training data**: OpenAI fine-tuning, Hugging Face datasets use JSONL as standard
- **Application logs**: one event per line, easily processed with grep/jq
- **Database exports**: MongoDB, Elasticsearch export in JSONL by default
- **ETL pipelines**: incremental processing of millions of records without RAM constraints
- **Streaming APIs**: chunk-by-chunk responses in AI APIs (ChatGPT, Claude)