feat: validate page count: real PDF pages vs html2pdf4doc pages

stanislaw · stanislaw · commit 781f4ed2350d · 2025-09-07T13:16:17.000+02:00
diff --git a/html2pdf4doc/html2pdf4doc.py b/html2pdf4doc/html2pdf4doc.py
@@ -10,9 +10,10 @@
 from datetime import datetime
 from pathlib import Path
 from time import sleep
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import requests
+from pypdf import PdfReader
 from requests import Response
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
@@ -39,6 +40,17 @@
 sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
 
 
+def extract_page_count(logs: List[Dict[str, str]]) -> int:
+    """
+    Return the page count reported by HTML2PDF4Doc in the browser logs.
+
+    The log entries are scanned in order, and the number captured from the
+    first entry whose "message" value matches the
+    '"[HTML2PDF4DOC] Page count:" <N>' pattern is returned as an int.
+
+    Raises ValueError if no entry contains a page count.
+    """
+    pattern = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')
+    for entry_ in logs:
+        match = pattern.search(entry_["message"])
+        if match:
+            return int(match.group(1))
+    raise ValueError("No page count found in logs.")
+
+
 class ChromeDriverManager:
     def get_chrome_driver(self, path_to_cache_dir: str) -> str:
         chrome_version: Optional[str] = self.get_chrome_version()
@@ -253,7 +265,7 @@ def get_inches_from_millimeters(mm: float) -> float:
     return mm / 25.4
 
 
-def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
+def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> Tuple[bytes, int]:
     print(f"html2pdf4doc: opening URL with ChromeDriver: {url}")  # noqa: T201
 
     driver.get(url)
@@ -285,21 +297,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
     }
 
     class Done(Exception):
-        pass
+        def __init__(self, page_count: int):
+            super().__init__()
+            self.page_count: int = page_count
 
     datetime_start = datetime.today()
 
     logs: List[Dict[str, str]] = []
+    page_count: int = 0
     try:
         while True:
             logs = driver.get_log("browser")  # type: ignore[no-untyped-call]
             for entry_ in logs:
                 if "[HTML2PDF4DOC] Total time:" in entry_["message"]:
                     print("success: HTML2PDF4Doc completed its job.")  # noqa: T201
-                    raise Done
+
+                    page_count = extract_page_count(logs)
+
+                    raise Done(page_count)
             if (datetime.today() - datetime_start).total_seconds() > 60:
                 raise TimeoutError
-            sleep(0.5)
+            sleep(0.1)
     except Done:
         pass
     except TimeoutError:
@@ -322,7 +340,13 @@ class Done(Exception):
     result = driver.execute_cdp_cmd("Page.printToPDF", calculated_print_options)
 
     data = base64.b64decode(result["data"])
-    return data
+
+    if page_count == 0:
+        raise RuntimeError(
+            "html2pdf4doc: Something went wrong. "
+            "Could not capture the printed page count from Chrome."
+        )
+    return data, page_count
 
 
 def create_webdriver(
@@ -521,9 +545,21 @@ def exit_handler() -> None:
 
             url = Path(os.path.abspath(path_to_input_html)).as_uri()
 
-            pdf_bytes = get_pdf_from_html(driver, url)
+            pdf_bytes, page_count = get_pdf_from_html(driver, url)
             with open(path_to_output_pdf, "wb") as f:
                 f.write(pdf_bytes)
+
+            reader = PdfReader(path_to_output_pdf)
+            if len(reader.pages) != page_count:
+                raise RuntimeError(
+                    "Something went wrong with the printed page. "
+                    f"Page count mismatch: "
+                    f"PDF pages: {reader.pages}, "
+                    f"html2pdf4doc pages: {page_count}."
+                )
+
+            assert reader.pages[0].extract_text() == "Hello world!"
+
     else:
         print("html2pdf4doc: unknown command.")  # noqa: T201
         sys.exit(1)
diff --git a/pyproject.toml b/pyproject.toml
@@ -56,6 +56,9 @@ dependencies = [
 
     # requests is used for downloading the Chrome driver.
     "requests",
+
+    # pypdf is used for validating the printed PDF.
+    "pypdf>=3.9.0",
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.development.txt b/requirements.development.txt
@@ -16,6 +16,3 @@ ruff>=0.9
 #
 lit
 filecheck==0.0.24
-
-# Integration tests use PyPDF to check the contents of the printed PDF.
-pypdf==3.9.0

Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,9 @@ dependencies = [`
`56`	`56`
`57`	`57`	`# requests is used for downloading the Chrome driver.`
`58`	`58`	`"requests",`
	`59`	`+`
	`60`	`+ # pypdf is used for validating the printed PDF.`
	`61`	`+ "pypdf>=3.9.0",`
`59`	`62`	`]`
`60`	`63`
`61`	`64`	`[project.optional-dependencies]`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,3 @@ ruff>=0.9`
`16`	`16`	`#`
`17`	`17`	`lit`
`18`	`18`	`filecheck==0.0.24`
`19`		`-`
`20`		`-# Integration tests use PyPDF to check the contents of the printed PDF.`
`21`		`-pypdf==3.9.0`