Extracting tables from PDF to Excel is one of the most common data engineering tasks. Python offers three main libraries for this, each with distinct strengths depending on the PDF type.
## Tool comparison
| Library | PDF type | Accuracy | Speed | Dependencies |
|---|---|---|---|---|
| **pdfplumber** | Native text | High | Fast | pdfminer |
| **tabula-py** | Native text | High | Medium | Java (JRE) |
| **camelot** | Native text | Very high | Slow | Ghostscript |
| **pytesseract** | Scanned (OCR) | Medium | Very slow | Tesseract |
## pdfplumber — fast and flexible extraction
```python
# pip install pdfplumber openpyxl pandas
import pdfplumber
import pandas as pd

# Extract all tables from page 1
with pdfplumber.open('report.pdf') as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()
    for i, table in enumerate(tables):
        # Guard against empty extractions, consistent with the multi-page
        # example below: table[0] would raise IndexError on an empty table.
        if not table or not table[0]:
            continue
        df = pd.DataFrame(table[1:], columns=table[0])  # row 0 = header
        print(f"\n--- Table {i+1} ---")
        print(df.head())
        df.to_excel(f'table_{i+1}.xlsx', index=False)
```
## pdfplumber — multiple pages
```python
import pdfplumber
import pandas as pd
def extract_all_tables(pdf_path):
    """Extract and concatenate tables from all pages.

    Each row is tagged with its source page (``_page``) and the table's
    1-based index on that page (``_table``) so origins stay traceable
    after concatenation.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A single concatenated DataFrame, or an empty DataFrame when no
        tables are found.
    """
    dfs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            tables = page.extract_tables()
            for i, table in enumerate(tables):
                # Skip empty extractions: table[0] is used as the header row.
                if not table or not table[0]:
                    continue
                df = pd.DataFrame(table[1:], columns=table[0])
                df['_page'] = page_num
                df['_table'] = i + 1
                dfs.append(df)
                print(f"Page {page_num}, table {i+1}: {len(df)} rows")
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
# Run the extraction and write everything to a single sheet.
df_all = extract_all_tables('report.pdf')
with pd.ExcelWriter('all_tables.xlsx', engine='openpyxl') as writer:
    df_all.to_excel(writer, sheet_name='All', index=False)
print(f"Total rows extracted: {len(df_all)}")
```
## tabula-py — area-based precision
tabula-py wraps Tabula (Java) and excels at tables with clear borders:
```python
# pip install tabula-py
# Requires Java Runtime Environment (JRE)
import tabula
import pandas as pd

# Extract all tables from the PDF
dfs = tabula.read_pdf('report.pdf', pages='all', multiple_tables=True)
print(f"Tables found: {len(dfs)}")
for i, df in enumerate(dfs):
    print(f"\n--- Table {i+1} ({len(df)} rows x {len(df.columns)} cols) ---")
    print(df.head(3))

# Export first table to Excel
dfs[0].to_excel('first_table.xlsx', index=False)

# Direct export (skipping DataFrame). NOTE: tabula.convert_into only
# supports 'csv', 'tsv', and 'json' — not xlsx. For Excel output, go
# through read_pdf() + DataFrame.to_excel() as shown above.
tabula.convert_into('report.pdf', 'tables.csv', output_format='csv', pages='all')

# Extract from a specific area (points: top, left, bottom, right)
df_area = tabula.read_pdf(
    'report.pdf',
    pages=2,
    area=[100, 50, 400, 550],
    multiple_tables=False,
)[0]
```
## camelot — maximum accuracy for complex tables
camelot uses Ghostscript and is the most accurate for tables with explicit borders:
```python
# pip install camelot-py[cv]
# Also requires: pip install ghostscript
import camelot
import pandas as pd

# 'lattice' mode for tables with borders
tables = camelot.read_pdf('report.pdf', flavor='lattice', pages='all')
print(f"Tables found: {tables.n}")
print(tables[0].parsing_report)
# {'accuracy': 99.02, 'whitespace': 12.24, 'order': 1, 'page': 1}

# 'stream' mode for borderless tables (whitespace-delimited)
tables_stream = camelot.read_pdf('report.pdf', flavor='stream', pages='1,2,3')

# Export all to Excel (one sheet per table)
tables.export('result.xlsx', f='excel', compress=False)

# Custom Excel with multiple sheets
with pd.ExcelWriter('result_custom.xlsx', engine='openpyxl') as writer:
    for i, table in enumerate(tables):
        df = table.df
        df.to_excel(writer, sheet_name=f'Table_{i+1}', index=False)
```
## Scanned PDFs — OCR with pytesseract
For image-based PDFs without selectable text:
```python
# pip install pytesseract pdf2image pillow
# Requires Tesseract: apt install tesseract-ocr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
def ocr_pdf_to_text(pdf_path, language='eng'):
    """Convert scanned PDF to text using OCR.

    Renders every page as an image at 300 dpi (a reasonable trade-off
    between OCR accuracy and speed/memory) and runs Tesseract on each.

    Args:
        pdf_path: Path to the scanned PDF.
        language: Tesseract language code (e.g. 'eng', 'deu').

    Returns:
        All recognized text, with '--- Page N ---' markers between pages.
    """
    images = convert_from_path(pdf_path, dpi=300)
    # Collect page chunks and join once (avoids quadratic str +=).
    chunks = []
    for i, img in enumerate(images):
        text = pytesseract.image_to_string(img, lang=language)
        chunks.append(f'\n--- Page {i+1} ---\n' + text)
    return ''.join(chunks)
# Structured table data from scanned image
def ocr_table_from_image(image_path):
    """Return per-word OCR results (text plus bounding box) for one image.

    Uses pytesseract's DATAFRAME output, so pandas must be installed.
    Words with confidence <= 60 are dropped; tune the threshold for
    noisier scans.

    Args:
        image_path: Path to the image containing the table.

    Returns:
        DataFrame with columns: text, left, top, width, height.
    """
    img = Image.open(image_path)
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    # Keep only high-confidence text
    reliable = data[data['conf'] > 60][['text', 'left', 'top', 'width', 'height']]
    return reliable
```
## Cleaning and exporting to final Excel
```python
import pandas as pd
import re
def clean_dataframe(df):
    """Standard cleanup after PDF extraction.

    Steps, in order:
      1. Drop rows that are entirely empty.
      2. Collapse internal whitespace runs to a single space and strip
         each cell (non-null cells become strings; NaN is left as-is).
      3. Remove exact-duplicate rows.
      4. Reset the index.

    Args:
        df: Raw DataFrame produced by a PDF table extractor.

    Returns:
        A new, cleaned DataFrame; the input is not modified.
    """
    df = df.dropna(how='all')
    df = df.apply(lambda col: col.map(
        lambda x: re.sub(r'\s+', ' ', str(x)).strip() if pd.notna(x) else x
    ))
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df
# Full pipeline
import pdfplumber

# Gather every table from every page as a raw DataFrame.
with pdfplumber.open('data.pdf') as pdf:
    raw_tables = []
    for page in pdf.pages:
        for t in page.extract_tables():
            if t and len(t) > 1:  # need a header row plus at least one data row
                df = pd.DataFrame(t[1:], columns=t[0])
                raw_tables.append(df)

clean_tables = [clean_dataframe(df) for df in raw_tables]

# Export to Excel with auto-width columns
with pd.ExcelWriter('final_result.xlsx', engine='openpyxl') as writer:
    for i, df in enumerate(clean_tables):
        sheet_name = f'Table_{i+1}'[:31]  # Excel limits sheet names to 31 chars
        df.to_excel(writer, sheet_name=sheet_name, index=False)
        sheet = writer.sheets[sheet_name]
        # Size each column to its longest cell value, capped at 50 chars.
        for col in sheet.columns:
            max_len = max(len(str(c.value or '')) for c in col)
            sheet.column_dimensions[col[0].column_letter].width = min(max_len + 2, 50)

print(f"Exported: {len(clean_tables)} tables to final_result.xlsx")
```
## Choosing the right tool
- **pdfplumber**: first choice for most native PDFs. Fast, no heavy dependencies.
- **tabula-py**: ideal when tables have explicit borders and you need area-based extraction.
- **camelot**: for complex corporate PDFs where precision is critical.
- **pytesseract + pdf2image**: essential for scanned PDFs or document photographs.
With these four tools you can cover the full spectrum of PDF table extraction — start with pdfplumber, escalate to camelot or tabula-py for tricky layouts, and reserve OCR for scanned documents.