Testing WeasyPrint HTML-to-PDF Pipelines with pytest and Visual Regression
WeasyPrint converts HTML/CSS to PDF in Python, making it popular for invoice generation, report rendering, and document exports in Django/Flask apps. Testing WeasyPrint output involves extracting text from the generated PDF bytes, comparing page structure, and doing visual regression checks against PNG renderings. This guide covers the full pytest testing stack for WeasyPrint.
Key Takeaways
Generate PDFs to BytesIO, not files. Use io.BytesIO as the WeasyPrint target for in-memory generation — faster and side-effect-free in tests.
Use PyMuPDF (fitz) for text extraction. PyMuPDF is faster than pdfminer for text extraction and provides good API coverage for page size, page count, and image detection.
Test CSS print rules independently. WeasyPrint respects @page, page-break-*, and @media print — test these rules with Playwright or with WeasyPrint's own CSS parsing API before asserting on PDF output.
Visual snapshots are the best regression net. WeasyPrint renders deterministically for a given HTML+CSS input, provided the WeasyPrint version and the installed fonts stay the same. Snapshot tests catch layout regressions from CSS or template changes.
Parametrize tests over data variants. Invoice PDFs with 1 line item, 50 line items, and 0 line items behave differently — use @pytest.mark.parametrize to cover all variants.
WeasyPrint Testing Overview
WeasyPrint is a Python library that converts HTML and CSS to PDF. A typical usage:
# src/pdf/generator.py
import io
from weasyprint import HTML, CSS
def generate_invoice_pdf(invoice_data: dict) -> bytes:
"""Generate an invoice PDF from a template and return bytes."""
html_content = render_invoice_template(invoice_data)
buffer = io.BytesIO()
HTML(string=html_content).write_pdf(
buffer,
stylesheets=[CSS(filename='static/css/invoice.css')]
)
return buffer.getvalue()Testing this function requires:
- Calling generate_invoice_pdf() with test data
- Parsing the returned bytes to extract text, page count, and structure
- Optionally rendering to PNG for visual comparison
Setting Up the Test Environment
Install test dependencies:
pip install pytest weasyprint pymupdf pytest-snapshot
# PyMuPDF (imported as 'fitz') for PDF parsing

conftest.py with shared fixtures:
# tests/conftest.py
import pytest
from tests.factories import build_invoice, build_multi_page_invoice
@pytest.fixture
def sample_invoice():
    """Invoice INV-2026-001 for Acme Corp with two line items (total 7700)."""
    web_development = {"description": "Web Development", "quantity": 40, "rate": 150, "total": 6000}
    design_consultation = {"description": "Design Consultation", "quantity": 5, "rate": 200, "total": 1000}

    invoice_fields = {
        "number": "INV-2026-001",
        "client_name": "Acme Corp",
        "line_items": [web_development, design_consultation],
        "subtotal": 7000,
        "tax": 700,
        "total": 7700,
    }
    return build_invoice(**invoice_fields)
@pytest.fixture
def sample_pdf(sample_invoice):
from src.pdf.generator import generate_invoice_pdf
return generate_invoice_pdf(sample_invoice)Basic PDF Content Tests with PyMuPDF
# tests/pdf/test_invoice_pdf.py
import fitz  # PyMuPDF
import pytest

from src.pdf.generator import generate_invoice_pdf
from tests.factories import build_invoice, build_multi_page_invoice
class TestInvoicePdfContent:
def test_generates_valid_pdf_bytes(self, sample_pdf):
assert isinstance(sample_pdf, bytes)
assert len(sample_pdf) > 1000
# PDF files start with the %PDF header
assert sample_pdf[:4] == b"%PDF"
def test_contains_invoice_number(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
full_text = "".join(page.get_text() for page in doc)
doc.close()
assert "INV-2026-001" in full_text
def test_contains_client_name(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
text = doc[0].get_text()
doc.close()
assert "Acme Corp" in text
def test_contains_all_line_items(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
full_text = "".join(page.get_text() for page in doc)
doc.close()
assert "Web Development" in full_text
assert "Design Consultation" in full_text
def test_shows_correct_total(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
full_text = "".join(page.get_text() for page in doc)
doc.close()
# Allow for different formatting: 7,700 or 7700 or 7.700
assert any(fmt in full_text for fmt in ("7,700", "7700", "7.700")), \
f"Total not found in PDF text: {full_text[:500]}"Page Count and Structure Tests
class TestInvoicePdfStructure:
def test_single_page_for_standard_invoice(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
assert doc.page_count == 1
doc.close()
def test_multi_page_for_long_invoice(self):
large_invoice = build_multi_page_invoice(line_item_count=100)
pdf = generate_invoice_pdf(large_invoice)
doc = fitz.open(stream=pdf, filetype="pdf")
assert doc.page_count > 1, "100-item invoice should span multiple pages"
doc.close()
def test_a4_page_size(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
page = doc[0]
# A4 in PDF points: 595 x 842
# Allow ±2 points for rounding
assert abs(page.rect.width - 595) <= 2, f"Expected ~595pt width, got {page.rect.width}"
assert abs(page.rect.height - 842) <= 2, f"Expected ~842pt height, got {page.rect.height}"
doc.close()
@pytest.mark.parametrize("line_item_count,expected_pages", [
(1, 1),
(20, 1),
(50, 2),
(100, 3),
])
def test_page_count_by_line_item_count(self, line_item_count, expected_pages):
invoice = build_multi_page_invoice(line_item_count=line_item_count)
pdf = generate_invoice_pdf(invoice)
doc = fitz.open(stream=pdf, filetype="pdf")
# Allow ± 1 page — exact count depends on font and CSS
assert abs(doc.page_count - expected_pages) <= 1, \
f"{line_item_count} items: expected ~{expected_pages} pages, got {doc.page_count}"
doc.close()PDF Metadata Tests
class TestInvoicePdfMetadata:
def test_pdf_title_is_set(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
metadata = doc.metadata
doc.close()
assert metadata.get("title"), "PDF title should be set"
assert "INV-2026-001" in metadata["title"]
def test_pdf_author_is_set(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
metadata = doc.metadata
doc.close()
assert metadata.get("author") == "HelpMeTest"
def test_pdf_creator_is_set(self, sample_pdf):
doc = fitz.open(stream=sample_pdf, filetype="pdf")
metadata = doc.metadata
doc.close()
assert "WeasyPrint" in (metadata.get("creator") or "")Visual Snapshot Tests
WeasyPrint renders HTML deterministically. Snapshot tests catch regressions from CSS changes:
# tests/pdf/test_invoice_visual.py
import io
import fitz
from pathlib import Path
SNAPSHOTS_DIR = Path("tests/snapshots/pdf")
def render_pdf_page_as_png(pdf_bytes: bytes, page_number: int = 0, dpi: int = 150) -> bytes:
    """Render a PDF page to PNG bytes using PyMuPDF.

    Args:
        pdf_bytes: The PDF document as raw bytes.
        page_number: Zero-based index of the page to render.
        dpi: Target resolution of the rendered image.

    Returns:
        The rendered page encoded as PNG.
    """
    # The context manager closes the document even if the page index is
    # invalid or rendering raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page = doc[page_number]
        # Render at specified DPI (72 DPI is default, multiply by DPI/72)
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
class TestInvoiceVisualRegression:
def test_first_page_matches_snapshot(self, sample_pdf, tmp_path):
SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
snapshot_path = SNAPSHOTS_DIR / "invoice-page1.png"
rendered = render_pdf_page_as_png(sample_pdf, page_number=0)
if not snapshot_path.exists():
snapshot_path.write_bytes(rendered)
pytest.skip("Baseline snapshot created — rerun tests to compare")
baseline = snapshot_path.read_bytes()
# Simple byte comparison — deterministic for same input
if rendered != baseline:
# Save diff artifact for investigation
(tmp_path / "rendered.png").write_bytes(rendered)
(tmp_path / "baseline.png").write_bytes(baseline)
pytest.fail(
f"Visual regression detected. Artifacts saved to {tmp_path}. "
"If the change is intentional, delete the snapshot file and rerun."
)For more tolerant visual comparisons (e.g., allowing minor font rendering differences), use pixelmatch via subprocess or the pytest-snapshot plugin with a custom comparison function.
Testing CSS Print Rules
WeasyPrint's CSS rendering can be tested by inspecting the parsed stylesheet:
from weasyprint import CSS
from weasyprint.css import PageType
def test_invoice_css_defines_a4_page():
"""Verify that the invoice CSS correctly sets A4 page size."""
css = CSS(filename="static/css/invoice.css")
# WeasyPrint exposes the computed page rules
# Access the page CSS declarations directly from the parsed stylesheet
page_rules = [
rule for rule in css.matcher.lower_local_name_selectors.get("@page", [])
]
# Alternative: Check via a generated document
import io
from weasyprint import HTML
html = HTML(string="<html><body><p>Test</p></body></html>")
doc = html.render(stylesheets=[css])
page = doc.pages[0]
# A4 in WeasyPrint points (1pt = 1/72 inch)
assert abs(page._page_box.width - 595.3) < 1.0, \
f"Page width should be A4 (595.3pt), got {page._page_box.width}"For testing page-break-* behavior — the most common source of multi-page layout bugs:
def test_table_rows_have_page_break_inside_avoid():
"""Verify that table rows won't split across pages."""
css = CSS(string="tr { page-break-inside: avoid; }")
# Render a long table and verify no row is split
long_table_html = "<table>" + "".join(
f"<tr><td>Row {i}</td><td>${i * 100}</td></tr>"
for i in range(50)
) + "</table>"
html = HTML(string=f"<html><body>{long_table_html}</body></html>")
pdf_bytes = io.BytesIO()
html.write_pdf(pdf_bytes, stylesheets=[css])
# If page-break-inside: avoid is working, no row should be split
# Check by extracting text by page and verifying rows are complete
doc = fitz.open(stream=pdf_bytes.getvalue(), filetype="pdf")
for page in doc:
text = page.get_text()
# Each 'Row N' and '$N00' should appear on the same page
import re
row_numbers = re.findall(r"Row (\d+)", text)
amounts = re.findall(r"\$(\d+00)", text)
assert len(row_numbers) == len(amounts), \
f"Page has mismatched rows/amounts: {row_numbers} vs {amounts}"
doc.close()Django/Flask Integration Tests
If WeasyPrint is used in a web framework view, test the view response:
# Django test
from django.test import TestCase, Client
class InvoiceDownloadViewTest(TestCase):
def setUp(self):
self.client = Client()
self.invoice = Invoice.objects.create(
number="INV-TEST-001",
client_name="Test Corp",
total=5000,
)
def test_invoice_download_returns_pdf(self):
response = self.client.get(f"/invoices/{self.invoice.pk}/download/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response["Content-Type"], "application/pdf")
self.assertIn(
"attachment; filename=",
response["Content-Disposition"]
)
# Verify the response body is a valid PDF
content = b"".join(response.streaming_content) \
if hasattr(response, "streaming_content") \
else response.content
self.assertTrue(content.startswith(b"%PDF"))
def test_invoice_download_contains_invoice_number(self):
response = self.client.get(f"/invoices/{self.invoice.pk}/download/")
content = response.content
doc = fitz.open(stream=content, filetype="pdf")
full_text = "".join(page.get_text() for page in doc)
doc.close()
self.assertIn("INV-TEST-001", full_text)Error Handling Tests
class TestInvoicePdfErrorHandling:
def test_handles_empty_line_items(self):
invoice = build_invoice(number="INV-EMPTY", line_items=[])
pdf = generate_invoice_pdf(invoice)
assert pdf.startswith(b"%PDF")
doc = fitz.open(stream=pdf, filetype="pdf")
text = doc[0].get_text()
doc.close()
assert "INV-EMPTY" in text
def test_handles_very_long_description(self):
invoice = build_invoice(
number="INV-LONG",
line_items=[{
"description": "A" * 500,
"quantity": 1,
"rate": 100,
"total": 100,
}]
)
# Should not raise
pdf = generate_invoice_pdf(invoice)
assert pdf.startswith(b"%PDF")
def test_handles_unicode_content(self):
invoice = build_invoice(
number="INV-UNICODE",
client_name="Müller & Söhne GmbH",
line_items=[{"description": "Café consulting — €200/hr", "quantity": 10, "rate": 200, "total": 2000}]
)
pdf = generate_invoice_pdf(invoice)
doc = fitz.open(stream=pdf, filetype="pdf")
text = doc[0].get_text()
doc.close()
# WeasyPrint supports Unicode via CSS font-face — verify at least partial match
assert "INV-UNICODE" in textCI Configuration
# GitHub Actions workflow: run the PDF test suite on every push and pull request.
name: WeasyPrint PDF Tests
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install system dependencies (WeasyPrint requires Pango, Cairo)
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            libpango-1.0-0 libpangoft2-1.0-0 \
            libcairo2 libgdk-pixbuf-2.0-0
      - run: pip install -r requirements.txt -r requirements-test.txt
      - run: pytest tests/pdf/ -v
      # Runs only when an earlier step failed; collects pytest's temporary
      # directories under /tmp.
      - name: Upload visual diff artifacts
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: pdf-visual-diffs
          path: /tmp/pytest-*/

WeasyPrint requires Pango, Cairo, and GDK-PixBuf on the host system — these are system packages, not pip packages. Pin the WeasyPrint version in requirements.txt to prevent rendering changes from upstream updates.
Summary
WeasyPrint PDF testing in Python:
| Test concern | Tool |
|---|---|
| Text content | fitz.Page.get_text() |
| Page count | fitz.Document.page_count |
| Page size | fitz.Page.rect |
| Metadata | fitz.Document.metadata |
| Visual regression | fitz.Page.get_pixmap() + byte comparison |
| CSS rules | WeasyPrint render introspection |
| Django/Flask integration | django.test.Client / flask.testing.FlaskClient |
For browser-level tests that verify the full download flow — clicking "Download Invoice" and confirming the file opens correctly — HelpMeTest covers what pytest cannot: the actual user experience of receiving and opening a PDF from a web application.