Extracting tables from PDF to Excel is one of the most common data engineering tasks. Python offers three main libraries for this, each with distinct strengths depending on the PDF type.
## Tool comparison
| Library | PDF type | Accuracy | Speed | Dependencies |
|---|---|---|---|---|
| **pdfplumber** | Native text | High | Fast | pdfminer |
| **tabula-py** | Native text | High | Medium | Java (JRE) |
| **camelot** | Native text | Very high | Slow | Ghostscript |
| **pytesseract** | Scanned (OCR) | Medium | Very slow | Tesseract |
## pdfplumber — fast and flexible extraction
```python
# pip install pdfplumber openpyxl pandas
import pdfplumber
import pandas as pd

# Extract all tables from page 1
with pdfplumber.open('report.pdf') as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()
    for i, table in enumerate(tables):
        # Guard against empty extractions, consistent with the multi-page
        # example below: table[0] would raise IndexError on an empty table.
        if not table or not table[0]:
            continue
        df = pd.DataFrame(table[1:], columns=table[0])  # row 0 = header
        print(f"\n--- Table {i+1} ---")
        print(df.head())
        df.to_excel(f'table_{i+1}.xlsx', index=False)
```
## pdfplumber — multiple pages
```python
import pdfplumber
import pandas as pd
def extract_all_tables(pdf_path):
    """Extract and concatenate tables from all pages.

    Each row is tagged with its source page (``_page``) and the table's
    1-based index on that page (``_table``) so origins stay traceable
    after concatenation.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A single concatenated DataFrame, or an empty DataFrame when no
        tables are found.
    """
    dfs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            tables = page.extract_tables()
            for i, table in enumerate(tables):
                # Skip empty extractions: table[0] is used as the header row.
                if not table or not table[0]:
                    continue
                df = pd.DataFrame(table[1:], columns=table[0])
                df['_page'] = page_num
                df['_table'] = i + 1
                dfs.append(df)
                print(f"Page {page_num}, table {i+1}: {len(df)} rows")
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
# Run the extraction and write everything to a single sheet.
df_all = extract_all_tables('report.pdf')
with pd.ExcelWriter('all_tables.xlsx', engine='openpyxl') as writer:
    df_all.to_excel(writer, sheet_name='All', index=False)
print(f"Total rows extracted: {len(df_all)}")
```
## tabula-py — area-based precision
tabula-py wraps Tabula (Java) and excels at tables with clear borders:
```python
# pip install tabula-py
# Requires Java Runtime Environment (JRE)
import tabula
import pandas as pd

# Extract all tables from the PDF
dfs = tabula.read_pdf('report.pdf', pages='all', multiple_tables=True)
print(f"Tables found: {len(dfs)}")
for i, df in enumerate(dfs):
    print(f"\n--- Table {i+1} ({len(df)} rows x {len(df.columns)} cols) ---")
    print(df.head(3))

# Export first table to Excel
dfs[0].to_excel('first_table.xlsx', index=False)

# Direct export (skipping DataFrame). NOTE: tabula.convert_into only
# supports 'csv', 'tsv', and 'json' — not xlsx. For Excel output, go
# through read_pdf() + DataFrame.to_excel() as shown above.
tabula.convert_into('report.pdf', 'tables.csv', output_format='csv', pages='all')

# Extract from a specific area (points: top, left, bottom, right)
df_area = tabula.read_pdf(
    'report.pdf',
    pages=2,
    area=[100, 50, 400, 550],
    multiple_tables=False,
)[0]
```
## camelot — maximum accuracy for complex tables
camelot uses Ghostscript and is the most accurate for tables with explicit borders:
```python
# pip install camelot-py[cv]
# Also requires: pip install ghostscript
import camelot
import pandas as pd

# 'lattice' mode for tables with borders
tables = camelot.read_pdf('report.pdf', flavor='lattice', pages='all')
print(f"Tables found: {tables.n}")
print(tables[0].parsing_report)
# {'accuracy': 99.02, 'whitespace': 12.24, 'order': 1, 'page': 1}

# 'stream' mode for borderless tables (whitespace-delimited)
tables_stream = camelot.read_pdf('report.pdf', flavor='stream', pages='1,2,3')

# Export all to Excel (one sheet per table)
tables.export('result.xlsx', f='excel', compress=False)

# Custom Excel with multiple sheets
with pd.ExcelWriter('result_custom.xlsx', engine='openpyxl') as writer:
    for i, table in enumerate(tables):
        df = table.df
        df.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
```
## Scanned PDFs — OCR with pytesseract
For image-based PDFs without selectable text:
```python
# pip install pytesseract pdf2image pillow
# Requires Tesseract: apt install tesseract-ocr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def ocr_pdf_to_text(pdf_path, language='eng'):
    """Convert scanned PDF to text using OCR.

    Renders every page as an image at 300 dpi (a reasonable trade-off
    between OCR accuracy and speed/memory) and runs Tesseract on each.

    Args:
        pdf_path: Path to the scanned PDF.
        language: Tesseract language code (e.g. 'eng', 'deu').

    Returns:
        All recognized text, with '--- Page N ---' markers between pages.
    """
    images = convert_from_path(pdf_path, dpi=300)
    # Collect page chunks and join once (avoids quadratic str +=).
    chunks = []
    for i, img in enumerate(images):
        text = pytesseract.image_to_string(img, lang=language)
        chunks.append(f'\n--- Page {i+1} ---\n' + text)
    return ''.join(chunks)
# Structured table data from scanned image
def ocr_table_from_image(image_path):
    """Return per-word OCR results (text plus bounding box) for one image.

    Uses pytesseract's DATAFRAME output, so pandas must be installed.
    Words with confidence <= 60 are dropped; tune the threshold for
    noisier scans.

    Args:
        image_path: Path to the image containing the table.

    Returns:
        DataFrame with columns: text, left, top, width, height.
    """
    img = Image.open(image_path)
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    # Keep only high-confidence text
    reliable = data[data['conf'] > 60][['text', 'left', 'top', 'width', 'height']]
    return reliable
```
## Cleaning and exporting to final Excel
```python
import pandas as pd
import re
def clean_dataframe(df):
    """Standard cleanup after PDF extraction.

    Steps, in order:
      1. Drop rows that are entirely empty.
      2. Collapse internal whitespace runs to a single space and strip
         each cell (non-null cells become strings; NaN is left as-is).
      3. Remove exact-duplicate rows.
      4. Reset the index.

    Args:
        df: Raw DataFrame produced by a PDF table extractor.

    Returns:
        A new, cleaned DataFrame; the input is not modified.
    """
    df = df.dropna(how='all')
    df = df.apply(lambda col: col.map(
        lambda x: re.sub(r'\s+', ' ', str(x)).strip() if pd.notna(x) else x
    ))
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df
# Full pipeline
import pdfplumber

# Gather every table from every page as a raw DataFrame.
with pdfplumber.open('data.pdf') as pdf:
    raw_tables = []
    for page in pdf.pages:
        for t in page.extract_tables():
            if t and len(t) > 1:  # need a header row plus at least one data row
                df = pd.DataFrame(t[1:], columns=t[0])
                raw_tables.append(df)

clean_tables = [clean_dataframe(df) for df in raw_tables]

# Export to Excel with auto-width columns
with pd.ExcelWriter('final_result.xlsx', engine='openpyxl') as writer:
    for i, df in enumerate(clean_tables):
        sheet_name = f'Table_{i+1}'[:31]  # Excel limits sheet names to 31 chars
        df.to_excel(writer, sheet_name=sheet_name, index=False)
        sheet = writer.sheets[sheet_name]
        # Size each column to its longest cell value, capped at 50 chars.
        for col in sheet.columns:
            max_len = max(len(str(c.value or '')) for c in col)
            sheet.column_dimensions[col[0].column_letter].width = min(max_len + 2, 50)

print(f"Exported: {len(clean_tables)} tables to final_result.xlsx")
```
## Choosing the right tool
- **pdfplumber**: first choice for most native PDFs. Fast, no heavy dependencies.
- **tabula-py**: ideal when tables have explicit borders and you need area-based extraction.
- **camelot**: for complex corporate PDFs where precision is critical.
- **pytesseract + pdf2image**: essential for scanned PDFs or document photographs.
With these four tools you can cover the full spectrum of PDF table extraction — start with pdfplumber, escalate to camelot or tabula-py for tricky layouts, and reserve OCR for scanned documents.