JSONL (JSON Lines), also known as NDJSON (Newline Delimited JSON), is a text format where each line is a valid, self-contained JSON object. It is the de facto standard for LLM training data, application logs, and large-scale data pipelines.
## JSONL vs JSON vs CSV: when to use each
| Feature | JSON | JSONL | CSV |
|---|---|---|---|
| Structure | Single object/array | One line = one record | Tabular (rows/columns) |
| Streaming | No | **Yes** | Partial |
| Append without re-parsing | No | **Yes** | Yes |
| Nested data | Yes | **Yes** | No |
| Human-readable | High | Medium | High |
| Compression | Good | **Very good** | Good |
| ML/LLM use | No | **Standard** | No |
## JSONL file structure
```jsonl
{"id": 1, "name": "Alice", "tags": ["python", "data"]}
{"id": 2, "name": "Bob", "tags": ["web", "devops"]}
{"id": 3, "name": "Carol", "score": 9.5, "active": true}
```
Each line is an independent JSON object — empty lines are ignored by convention.
## Reading JSONL in Python
```python
import json
from pathlib import Path
# Line-by-line generator (streaming — constant memory)
def read_jsonl(path):
    """Yield one parsed JSON object per non-empty line of *path*.

    Blank lines are skipped, so trailing newlines don't produce records.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
# Use as generator (efficient for large files)
for record in read_jsonl('data.jsonl'):
    print(record['name'])

# Load all into list (only if it fits in memory)
records = list(read_jsonl('data.jsonl'))
print(f"Total records: {len(records)}")
# Count lines without loading into memory
def count_jsonl(path):
    """Return the number of records (non-empty lines) in *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        # Generator expression streams the file; blank lines don't count
        return sum(1 for line in f if line.strip())
# Report the record count for the sample file
total = count_jsonl('data.jsonl')
print(f"Records: {total}")
```
## Writing JSONL in Python
```python
import json
data = [
    {'id': 1, 'text': 'First record', 'value': 42},
    {'id': 2, 'text': 'Second record', 'value': 87},
    {'id': 3, 'text': 'Third record', 'value': 15},
]

# Write from list: one JSON document per line, ending with '\n'
with open('output.jsonl', 'w', encoding='utf-8') as f:
    for record in data:
        # ensure_ascii=False keeps non-ASCII text readable in the file
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Append to existing JSONL — no re-parsing of earlier records needed
new_record = {'id': 4, 'text': 'New record', 'value': 99}
with open('output.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(new_record, ensure_ascii=False) + '\n')
print("JSONL written successfully")
```
## Processing JSONL with pandas
```python
import pandas as pd
# Read JSONL with pandas (lines=True is key)
df = pd.read_json('data.jsonl', lines=True)
print(df.head())
print(df.dtypes)

# Filter and process
df_active = df[df['active'].eq(True)]
print(f"Active records: {len(df_active)}")

# Save results as JSONL
df_active.to_json('active.jsonl', orient='records', lines=True, force_ascii=False)

# Read compressed JSONL (gzip) directly
df_gz = pd.read_json('data.jsonl.gz', lines=True, compression='gzip')
## Streaming large files
For multi-GB JSONL files, streaming avoids loading everything into memory:
```python
import json
from collections import defaultdict
def jsonl_stats(path, numeric_field, group_field=None):
    """Calculate stats from JSONL without loading into memory.

    Streams the file and keeps only running aggregates per group
    (count, sum, min, max), so memory use is O(number of groups)
    rather than O(number of records).

    Args:
        path: JSONL file to read.
        numeric_field: key whose numeric value is aggregated.
        group_field: optional key to group by; records missing it
            (or when it is None) fall into the 'global' group.

    Returns:
        dict mapping group -> (n, mean, min, max). Also prints one
        summary line per group and the total of valid records.
    """
    # group -> [count, sum, min, max] running aggregates
    groups = {}
    total = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # Malformed lines are reported but do not stop the stream
                print(f"ERROR line {line_num}: {e}")
                continue
            value = record.get(numeric_field)
            if value is not None:
                key = record.get(group_field, 'global') if group_field else 'global'
                v = float(value)
                agg = groups.get(key)
                if agg is None:
                    groups[key] = [1, v, v, v]
                else:
                    agg[0] += 1
                    agg[1] += v
                    agg[2] = min(agg[2], v)
                    agg[3] = max(agg[3], v)
                total += 1
    stats = {}
    for group, (n, s, lo, hi) in groups.items():
        mean = s / n
        stats[group] = (n, mean, lo, hi)
        print(f"{group}: n={n}, mean={mean:.2f}, min={lo:.2f}, max={hi:.2f}")
    print(f"\nValid records: {total}")
    return stats
# Example: per-region stats over the 'amount' field of sales.jsonl
jsonl_stats('sales.jsonl', 'amount', 'region')
```
## Transform and filter JSONL pipeline
```python
import json
def jsonl_pipeline(input_path, output_path, filter_fn, transform_fn=None):
    """Pipeline: read JSONL -> filter -> transform -> write JSONL.

    Args:
        input_path: source JSONL file.
        output_path: destination JSONL file (overwritten).
        filter_fn: callable(record) -> bool; records failing it are dropped.
        transform_fn: optional callable(record) -> record, applied to
            each kept record before writing.
    """
    processed = 0
    written = 0
    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            processed += 1
            record = json.loads(line)
            if not filter_fn(record):
                continue
            if transform_fn:
                record = transform_fn(record)
            fout.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print(f"Processed: {processed} | Written: {written}")
# Example: filter sales > 1000 and add computed field
jsonl_pipeline(
    'sales.jsonl',
    'large_sales.jsonl',
    filter_fn=lambda rec: rec.get('amount', 0) > 1000,
    transform_fn=lambda rec: {**rec, 'category': 'large'},
)
```
## Convert JSON ↔ JSONL
```python
import json, gzip
# JSON array -> JSONL: load the array once, emit one line per element
with open('list.json', 'r', encoding='utf-8') as fin, open('list.jsonl', 'w', encoding='utf-8') as fout:
    data = json.load(fin)
    for item in data:
        fout.write(json.dumps(item, ensure_ascii=False) + '\n')

# JSONL -> JSON array: collect non-empty lines into one pretty-printed array
with open('list.jsonl', 'r', encoding='utf-8') as fin, open('list.json', 'w', encoding='utf-8') as fout:
    records = [json.loads(line) for line in fin if line.strip()]
    json.dump(records, fout, ensure_ascii=False, indent=2)

# Compress JSONL to gzip (reduces size by 70-80%)
with open('data.jsonl', 'rb') as fin, gzip.open('data.jsonl.gz', 'wb') as fout:
    fout.write(fin.read())
```
## Validate JSONL
```python
import json
def validate_jsonl(path, required_keys=None):
    """Validate each line is valid JSON and optionally has required keys.

    Args:
        path: JSONL file to check.
        required_keys: optional iterable of keys every record must contain.

    Returns:
        True when no errors were found. Prints a summary plus the
        first 10 error messages.
    """
    errors = []
    total = 0
    with open(path, 'r', encoding='utf-8') as f:
        for num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                record = json.loads(line)
                if required_keys:
                    missing = [k for k in required_keys if k not in record]
                    if missing:
                        errors.append(f"Line {num}: missing keys {missing}")
            except json.JSONDecodeError as e:
                errors.append(f"Line {num}: invalid JSON — {e}")
    print(f"Total: {total} | Errors: {len(errors)}")
    # Cap the report at 10 errors so huge broken files stay readable
    for err in errors[:10]:
        print(f"  {err}")
    return len(errors) == 0
# Example: require every record to carry 'id' and 'text'
validate_jsonl('data.jsonl', required_keys=['id', 'text'])
```
## Main JSONL use cases
- **LLM training data**: OpenAI fine-tuning, Hugging Face datasets use JSONL as standard
- **Application logs**: one event per line, easily processed with grep/jq
- **Database exports**: MongoDB, Elasticsearch export in JSONL by default
- **ETL pipelines**: incremental processing of millions of records without RAM constraints
- **Streaming APIs**: chunk-by-chunk responses in AI APIs (ChatGPT, Claude)