Convert PDF to Images with Python: PyMuPDF, pdf2image and Pillow
Extracting PDF pages as PNG or JPG images is a common task: document previews, diagram extraction, OCR processing, or visual archiving.
Tool Comparison
| Tool | Speed | External Dependencies | Custom DPI |
|---|---|---|---|
| PyMuPDF (fitz) | ⚡ Very fast | pip install pymupdf only | ✅ |
| pdf2image | Good | Poppler (external binary) | ✅ |
| Wand (ImageMagick) | Medium | ImageMagick (binary) | ✅ |
PyMuPDF (fitz) — The Fastest Option
pip install pymupdf
import fitz
import os
def pdf_to_images(pdf_path, output_dir="pages", dpi=150, fmt="png"):
    """Render every page of a PDF to an image file with PyMuPDF.

    Args:
        pdf_path: Path of the PDF to render.
        output_dir: Directory that receives the images; created if missing.
        dpi: Target resolution. The zoom factor is ``dpi / 72`` because PDF
            page coordinates use 72 units per inch.
        fmt: File extension used in the output file names (e.g. ``"png"``).

    Returns:
        List of the written image paths, in page order.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    # Reject a meaningless resolution before touching the filesystem.
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    os.makedirs(output_dir, exist_ok=True)
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)
    paths = []
    # The context manager closes the document even if rendering or saving fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)
            path = f"{output_dir}/page_{page_num:03d}.{fmt}"
            pixmap.save(path)
            paths.append(path)
            print(f" {path} ({pixmap.width}x{pixmap.height}px)")
    return paths
# Example: render document.pdf at 150 DPI; `images` receives the function's return value.
images = pdf_to_images("document.pdf", dpi=150)
Specific Pages Only
import fitz
# Render only selected pages. Page indices are zero-based.
matrix = fitz.Matrix(200/72, 200/72)  # 200 DPI
# The context manager closes the document even if rendering or saving fails.
with fitz.open("report.pdf") as doc:
    for idx in [0, 2, 4]:  # pages 1, 3, 5
        if idx < len(doc):  # skip indices past the end of the document
            pixmap = doc[idx].get_pixmap(matrix=matrix)
            pixmap.save(f"page_{idx+1}.png")
Transparent Background (Alpha Channel)
import fitz
# Render the first page at 2x zoom with alpha=True so the PNG has an alpha channel.
# The context manager closes the document even if rendering or saving fails.
with fitz.open("logo_in_pdf.pdf") as doc:
    pixmap = doc[0].get_pixmap(matrix=fitz.Matrix(2, 2), alpha=True)
    pixmap.save("with_transparency.png")
Save as JPG
import fitz
# Save every page as a JPEG at 150 DPI.
matrix = fitz.Matrix(150/72, 150/72)  # identical for every page, so build it once
# The context manager closes the document even if rendering or saving fails.
with fitz.open("catalog.pdf") as doc:
    for i, page in enumerate(doc):
        pix = page.get_pixmap(matrix=matrix, alpha=False)
        pix.save(f"page_{i+1:03d}.jpg", jpg_quality=85)
pdf2image — Poppler for Maximum Fidelity
pip install pdf2image
# Ubuntu: sudo apt-get install poppler-utils
# macOS: brew install poppler
from pdf2image import convert_from_path, convert_from_bytes
import os
# Convert the whole document to PIL images, then write one PNG per page.
pages = convert_from_path("document.pdf", dpi=150)
os.makedirs("output", exist_ok=True)
for number, rendered in enumerate(pages, start=1):
    rendered.save(f"output/page_{number:03d}.png", "PNG")

# From bytes (web APIs)
with open("document.pdf", "rb") as f:
    pdf_bytes = f.read()
pages = convert_from_bytes(pdf_bytes, dpi=120)
first_page = pages[0]
first_page.save("first_page.jpg", "JPEG", quality=90)
Advanced Options
import os

# pdf2image does not create output_folder itself; make sure it exists so the
# conversion does not fail on a missing directory.
os.makedirs("./output", exist_ok=True)
pages = convert_from_path(
    "report.pdf",
    dpi=200,
    output_folder="./output",  # render to files on disk instead of piping image data through memory
    fmt="jpeg",
    jpegopt={"quality": 85, "progressive": True},
    first_page=2,
    last_page=5,
    grayscale=False,
    size=(1200, None),  # fixed width, proportional height
    thread_count=4,
)
Large PDFs Without Exhausting RAM
import os

# pdf2image does not create output_folder itself; make sure it exists so the
# conversion does not fail on a missing directory.
os.makedirs("large_pages", exist_ok=True)
pages = convert_from_path(
    "large_book.pdf",
    dpi=150,
    output_folder="large_pages",
    paths_only=True,  # returns paths, not PIL objects
)
print(f"Generated {len(pages)} images")
Post-Processing with Pillow
from pdf2image import convert_from_path
from PIL import ImageEnhance
import os
# Convert each scanned page to grayscale, raise its contrast, then threshold
# it to a 1-bit image before saving.
pages = convert_from_path("scan.pdf", dpi=200)
os.makedirs("processed", exist_ok=True)
for page_no, scan in enumerate(pages, start=1):
    boosted = ImageEnhance.Contrast(scan.convert('L')).enhance(1.5)
    # Pixels brighter than 128 become white, everything else black.
    bitmap = boosted.point(lambda level: 255 if level > 128 else 0, '1')
    bitmap.save(f"processed/page_{page_no:03d}.png")
Extract Embedded Images from PDF
import fitz
# Write every image embedded in the PDF to its own file, named by page and
# position on that page.
# The context manager closes the document even if extraction or writing fails.
with fitz.open("catalog.pdf") as doc:
    for page_num, page in enumerate(doc):
        for img_idx, img_ref in enumerate(page.get_images(full=True)):
            xref = img_ref[0]  # first item of each entry is passed to extract_image
            image = doc.extract_image(xref)
            # The extracted dict supplies the file extension and the raw bytes.
            name = f"p{page_num+1}_img{img_idx+1}.{image['ext']}"
            with open(name, "wb") as f:
                f.write(image["image"])
            print(f"Extracted: {name}")
OCR with Tesseract
import fitz, pytesseract, io
from PIL import Image
# Render each page to a PNG in memory, run Tesseract on it, and collect the
# recognized text per page.
all_text = []
matrix = fitz.Matrix(300/72, 300/72)  # 300 DPI for OCR
# The context manager closes the document even if rendering or OCR fails.
with fitz.open("scanned_text.pdf") as doc:
    for num, page in enumerate(doc):
        pix = page.get_pixmap(matrix=matrix, alpha=False)
        # Go through PNG bytes so Pillow can open the rendered page.
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        text = pytesseract.image_to_string(img, lang='eng')
        all_text.append(f"--- Page {num+1} ---\n{text}")
        print(f"Page {num+1}: {len(text)} characters extracted")
with open("ocr_output.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(all_text))
DPI by Use Case
| Use case | Recommended DPI | Approx. file size per A4 page |
|---|---|---|
| Web preview | 72–96 | ~400–600 KB |
| On-screen reading | 120–150 | ~1–2 MB |
| Standard printing | 200–300 | ~3–8 MB |
| OCR / archiving | 300 | ~5–10 MB |
| Professional offset | 300–600 | 10–50 MB |
Convert PDF to Images Online
KaijuConverter allows direct PDF to PNG or JPG conversion in the browser, page by page or in bulk.
Related conversions
Document conversions that follow this topic naturally: