Working with PDFs in Python: pypdf, pdfplumber and ReportLab
Python has several libraries for working with PDFs: pypdf for reading, merging and splitting; pdfplumber for precise text and table extraction; ReportLab for creating PDFs from scratch.
1. Installation
pip install pypdf
pip install pdfplumber
pip install reportlab
pip install fpdf2 # Simple alternative to ReportLab
2. pypdf: read and manipulate PDFs
from pypdf import PdfReader, PdfWriter
reader = PdfReader("document.pdf")
# reader.metadata is None when the file carries no document-information
# dictionary, so fall back to an empty mapping before reading fields.
metadata = reader.metadata or {}
print(f"Pages : {len(reader.pages)}")
print(f"Author : {metadata.get('/Author', 'Unknown')}")
print(f"Title : {metadata.get('/Title', 'No title')}")
print(f"Encrypted: {reader.is_encrypted}")

# Extract text from all pages
page_texts = []
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    print(f"--- Page {i+1} ---")
    print(text[:200])  # preview only the first 200 characters
    page_texts.append(text)
# Join once instead of growing a string inside the loop; each page's text
# is followed by a newline.
full_text = "".join(f"{chunk}\n" for chunk in page_texts)

# Specific pages
page1 = reader.pages[0].extract_text()
last = reader.pages[-1].extract_text()
3. pypdf: merge, split and reorder
from pypdf import PdfReader, PdfWriter
# Merge multiple PDFs
def merge_pdfs(input_paths, output_path):
    """Concatenate the pages of several PDFs into one output file.

    Args:
        input_paths: Iterable of PDF paths, read in the given order.
        output_path: Path the combined PDF is written to.
    """
    merged = PdfWriter()
    for source_path in input_paths:
        source = PdfReader(source_path)
        for sheet in source.pages:
            merged.add_page(sheet)
    with open(output_path, "wb") as out_file:
        merged.write(out_file)


merge_pdfs(["part1.pdf", "part2.pdf", "part3.pdf"], "complete.pdf")
# Extract specific pages
def extract_pages(input_path, output_path, pages):
    """Write a new PDF containing only the selected pages.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the selection is written to.
        pages: Iterable of zero-based page indexes, copied in the given order.
    """
    source = PdfReader(input_path)
    selection = PdfWriter()
    available = source.pages
    for index in pages:
        selection.add_page(available[index])
    with open(output_path, "wb") as out_file:
        selection.write(out_file)


extract_pages("report.pdf", "summary.pdf", [0, 1, 2])  # Pages 1-3
# Split into individual pages
def split_by_page(input_path, output_folder):
    """Write every page of a PDF to its own single-page file.

    Files are named ``page_001.pdf``, ``page_002.pdf``, ... inside
    ``output_folder``.

    Args:
        input_path: Path of the PDF to split.
        output_folder: Directory that receives the per-page files; it is
            created, together with any missing parent directories, when it
            does not exist yet.
    """
    from pathlib import Path

    reader = PdfReader(input_path)
    folder = Path(output_folder)
    # parents=True lets callers pass a nested path such as "out/pages"
    # without creating the intermediate directories themselves.
    folder.mkdir(parents=True, exist_ok=True)
    for i, page in enumerate(reader.pages):
        writer = PdfWriter()
        writer.add_page(page)
        with open(folder / f"page_{i+1:03d}.pdf", "wb") as f:
            writer.write(f)
# Rotate pages
reader = PdfReader("landscape.pdf")
writer = PdfWriter()
for sheet in reader.pages:
    # rotate() updates the page in place and hands the same page back
    writer.add_page(sheet.rotate(90))
with open("rotated.pdf", "wb") as out_file:
    writer.write(out_file)
4. pypdf: encrypt and decrypt PDFs
from pypdf import PdfReader, PdfWriter
def encrypt_pdf(input_path, output_path, user_password, owner_password=None,
                algorithm=None):
    """Copy a PDF and protect the copy with a password.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the encrypted copy is written to.
        user_password: Password required to open the document.
        owner_password: Owner password; falls back to ``user_password``
            when omitted or empty.
        algorithm: Optional algorithm name forwarded to
            ``PdfWriter.encrypt`` (for example ``"AES-256"``). When ``None``
            the original ``use_128bit=True`` setting is used.
            NOTE(review): pypdf documents the 128-bit flag as RC4 and its
            AES modes as needing an extra crypto package - confirm before
            changing the default.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    # Only pass ``algorithm`` when the caller asked for one, so the default
    # call is exactly what it was before the parameter existed.
    if algorithm is None:
        cipher_options = {"use_128bit": True}
    else:
        cipher_options = {"algorithm": algorithm}
    writer.encrypt(
        user_password=user_password,
        owner_password=owner_password or user_password,
        **cipher_options,
    )
    with open(output_path, "wb") as f:
        writer.write(f)
encrypt_pdf("document.pdf", "protected.pdf", "my_password")

reader = PdfReader("protected.pdf")
# decrypt() reports whether the password matched; a falsy result means the
# document is still locked, so stop here instead of failing later on read.
if reader.is_encrypted and not reader.decrypt("my_password"):
    raise ValueError("Wrong password for protected.pdf")
print(reader.pages[0].extract_text())
5. pdfplumber: precise text and table extraction
import pdfplumber
# Extract text with precision
# Walk the invoice page by page and print whatever text each page yields
with pdfplumber.open("invoice.pdf") as pdf:
    for number, page in enumerate(pdf.pages, start=1):
        print(f"=== Page {number} ===")
        content = page.extract_text()
        if not content:
            continue  # nothing to print for this page
        print(content)
# Extract tables
# Print every table found on the first page, one row per line
with pdfplumber.open("financial_report.pdf") as pdf:
    page = pdf.pages[0]
    tables = page.extract_tables()
    for position, grid in enumerate(tables, start=1):
        print(f"\n--- Table {position} ---")
        for cells in grid:
            # Falsy cells (such as None) are shown as empty strings
            rendered = [cell or "" for cell in cells]
            print(" | ".join(rendered))
# Export table to CSV
# Save the first page's table to data.csv when one is found
with pdfplumber.open("data.pdf") as pdf:
    first_page = pdf.pages[0]
    table = first_page.extract_table()
    if table:
        import csv

        with open("data.csv", "w", newline="", encoding="utf-8") as csv_file:
            row_writer = csv.writer(csv_file)
            row_writer.writerows(table)
        print("Table exported to data.csv")
6. pdfplumber: words and coordinates
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
    page = pdf.pages[0]

    # First ten words, each printed with its x0 and top coordinates
    words = page.extract_words()
    for entry in words[:10]:
        left, top = entry["x0"], entry["top"]
        print(f"'{entry['text']}' at ({left:.0f}, {top:.0f})")

    # Crop a region (x0, y0, x1, y1) and print the text inside it
    bounding_box = (50, 100, 550, 400)
    region = page.crop(bounding_box)
    print(region.extract_text())

    # Search for text and report where each match sits
    matches = page.search("Total:")
    for hit in matches:
        print(f"'Total:' found at ({hit['x0']:.0f}, {hit['top']:.0f})")
7. ReportLab: create PDFs from scratch
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.enums import TA_CENTER
def create_invoice(output_path):
    """Build the sample invoice PDF and write it to ``output_path``.

    The document holds a centered title, the company heading and address
    line, a four-column line-item table ending in a total row, and a payment
    note. A confirmation line is printed once the file has been built.
    """
    brand_blue = colors.HexColor("#1565C0")
    row_tint = colors.HexColor("#E3F2FD")

    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle(
        "Title",
        parent=styles["Heading1"],
        alignment=TA_CENTER,
        textColor=brand_blue,
        fontSize=24,
    )

    line_items = [
        ["Description", "Qty", "Price", "Total"],
        ["Web development", "1", "$2,000.00", "$2,000.00"],
        ["Graphic design", "3", "$500.00", "$1,500.00"],
        ["Maintenance", "12", "$150.00", "$1,800.00"],
        ["", "", "TOTAL:", "$5,300.00"],
    ]
    header_row = ((0, 0), (-1, 0))
    total_row = ((0, -1), (-1, -1))
    table = Table(line_items, colWidths=[8 * cm, 3 * cm, 4 * cm, 4 * cm])
    table.setStyle(TableStyle([
        # Header row: white bold text on the brand colour
        ("BACKGROUND", *header_row, brand_blue),
        ("TEXTCOLOR", *header_row, colors.white),
        ("ALIGN", (0, 0), (-1, -1), "CENTER"),
        ("FONTNAME", *header_row, "Helvetica-Bold"),
        ("FONTSIZE", *header_row, 12),
        ("BOTTOMPADDING", *header_row, 12),
        # Item rows are tinted; the grid stops before the total row
        ("BACKGROUND", (0, 1), (-1, -2), row_tint),
        ("GRID", (0, 0), (-1, -2), 1, colors.lightgrey),
        ("FONTNAME", *total_row, "Helvetica-Bold"),
    ]))

    story = [
        Paragraph("INVOICE #2025-001", title_style),
        Spacer(1, 0.5 * cm),
        Paragraph("My Company Ltd.", styles["Heading2"]),
        Paragraph("123 Main St, London | VAT: GB123456789", styles["Normal"]),
        Spacer(1, 1 * cm),
        table,
        Spacer(1, 1 * cm),
        Paragraph("Payment due in 30 days. IBAN: GB12 BARC 3456 7890 1234 56", styles["Italic"]),
    ]
    doc.build(story)
    print(f"Invoice created: {output_path}")


create_invoice("invoice.pdf")
8. fpdf2: simple PDF generation
from fpdf import FPDF
class PDF(FPDF):
    """FPDF subclass that adds a report title and a page-number footer."""

    def header(self):
        """Draw the centered report title at the top of each page."""
        self.set_font("Helvetica", "B", 14)
        self.cell(0, 10, "Conversion Report", align="C")
        # The title cell is 10 units high and cell() leaves the cursor on the
        # same line, so advance past the cell plus a small gap. The previous
        # ln(5) left the body starting inside the title band.
        self.ln(15)

    def footer(self):
        """Draw the centered page number 15 units above the page bottom."""
        self.set_y(-15)
        self.set_font("Helvetica", "I", 8)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")
pdf = PDF()
pdf.add_page()

# Column titles paired with their widths, shared by the header and data rows
columns = [("Format", 50), ("Conversions", 60), ("Avg size", 60)]

# Simple table: white bold titles on a blue fill
pdf.set_fill_color(21, 101, 192)
pdf.set_text_color(255, 255, 255)
pdf.set_font("Helvetica", "B", 12)
for col, w in columns:
    pdf.cell(w, 10, col, border=1, fill=True)
pdf.ln()

pdf.set_text_color(0, 0, 0)
pdf.set_font("Helvetica", size=11)
# Plain "->" instead of a Unicode arrow: the built-in Helvetica font cannot
# encode that character, and this file registers no Unicode font.
rows = [("JPG -> WebP", "12,345", "1.2 MB"), ("PDF -> DOCX", "8,901", "450 KB")]
for row in rows:
    for text, (_, w) in zip(row, columns):
        pdf.cell(w, 9, text, border=1)
    pdf.ln()
pdf.output("report.pdf")
9. Best practices
- pypdf for structural operations: merge, split, rotate, encrypt.
- pdfplumber for extraction: better precision than pypdf for text and tables.
- ReportLab for generation: complex documents with styles, tables and images.
- fpdf2 for simple generation: lower learning curve than ReportLab.
- Not all PDFs are the same: text-based PDFs extract easily; image-based PDFs need OCR (pytesseract).
- Always use context managers: with open(...) and pdfplumber.open(...).
Related conversions
Document conversions that follow this topic naturally: