Skip to content

Commit 781f4ed

Browse files
committed
feat: validate page count: real PDF pages vs html2pdf4doc pages
1 parent 3584a4f commit 781f4ed

File tree

3 files changed

+46
-10
lines changed

3 files changed

+46
-10
lines changed

html2pdf4doc/html2pdf4doc.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
from datetime import datetime
1111
from pathlib import Path
1212
from time import sleep
13-
from typing import Dict, List, Optional
13+
from typing import Dict, List, Optional, Tuple
1414

1515
import requests
16+
from pypdf import PdfReader
1617
from requests import Response
1718
from selenium import webdriver
1819
from selenium.webdriver.chrome.options import Options
@@ -39,6 +40,17 @@
3940
sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
4041

4142

43+
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """
    Return the page count that html2pdf4doc reported in the given log entries.

    The entries are scanned in order and the number captured from the first
    message matching ``"[HTML2PDF4DOC] Page count:" <N>`` is returned.

    :param logs: log entries; each entry must provide a "message" key.
    :return: the reported page count as an integer.
    :raises ValueError: if no entry contains a page count message.
    """
    pattern = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')
    for entry_ in logs:
        # No per-entry printing here: this helper only parses, so that the
        # tool's stdout is not flooded with raw browser log messages.
        match = pattern.search(entry_["message"])
        if match:
            return int(match.group(1))
    raise ValueError("No page count found in logs.")
52+
53+
4254
class ChromeDriverManager:
4355
def get_chrome_driver(self, path_to_cache_dir: str) -> str:
4456
chrome_version: Optional[str] = self.get_chrome_version()
@@ -253,7 +265,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253265
return mm / 25.4
254266

255267

256-
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
268+
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> Tuple[bytes, int]:
257269
print(f"html2pdf4doc: opening URL with ChromeDriver: {url}") # noqa: T201
258270

259271
driver.get(url)
@@ -285,21 +297,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285297
}
286298

287299
class Done(Exception):
    """
    Internal signal used to leave the log-polling loop once html2pdf4doc
    has reported completion. Carries the page count parsed from the logs.
    """

    def __init__(self, page_count: int):
        # Forward the value to Exception so that args/str()/repr() carry it
        # and the exception can be reconstructed by copy/pickle.
        super().__init__(page_count)
        self.page_count: int = page_count
289303

290304
datetime_start = datetime.today()
291305

292306
logs: List[Dict[str, str]] = []
307+
page_count: int = 0
293308
try:
294309
while True:
295310
logs = driver.get_log("browser") # type: ignore[no-untyped-call]
296311
for entry_ in logs:
297312
if "[HTML2PDF4DOC] Total time:" in entry_["message"]:
298313
print("success: HTML2PDF4Doc completed its job.") # noqa: T201
299-
raise Done
314+
315+
page_count = extract_page_count(logs)
316+
317+
raise Done(page_count)
300318
if (datetime.today() - datetime_start).total_seconds() > 60:
301319
raise TimeoutError
302-
sleep(0.5)
320+
sleep(0.1)
303321
except Done:
304322
pass
305323
except TimeoutError:
@@ -322,7 +340,13 @@ class Done(Exception):
322340
result = driver.execute_cdp_cmd("Page.printToPDF", calculated_print_options)
323341

324342
data = base64.b64decode(result["data"])
325-
return data
343+
344+
if page_count == 0:
345+
raise RuntimeError(
346+
"html2pdf4doc: Something went wrong. "
347+
"Could not capture the printed page count from Chrome."
348+
)
349+
return data, page_count
326350

327351

328352
def create_webdriver(
@@ -521,9 +545,21 @@ def exit_handler() -> None:
521545

522546
url = Path(os.path.abspath(path_to_input_html)).as_uri()
523547

524-
pdf_bytes = get_pdf_from_html(driver, url)
548+
pdf_bytes, page_count = get_pdf_from_html(driver, url)
525549
with open(path_to_output_pdf, "wb") as f:
526550
f.write(pdf_bytes)
551+
552+
# Cross-check the page count reported by html2pdf4doc against the number
# of pages in the PDF file that was just written.
reader = PdfReader(path_to_output_pdf)
actual_page_count = len(reader.pages)
if actual_page_count != page_count:
    # Report the numeric page count, not the repr of the pages collection.
    raise RuntimeError(
        "Something went wrong with the printed page. "
        "Page count mismatch: "
        f"PDF pages: {actual_page_count}, "
        f"html2pdf4doc pages: {page_count}."
    )
# NOTE: no assertion on the extracted text of the first page here; a
# hard-coded expected text would fail for every real input document.
562+
527563
else:
528564
print("html2pdf4doc: unknown command.") # noqa: T201
529565
sys.exit(1)

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ dependencies = [
5656

5757
# requests is used for downloading the Chrome driver.
5858
"requests",
59+
60+
# pypdf is used for validating the printed PDF.
61+
"pypdf>=3.9.0",
5962
]
6063

6164
[project.optional-dependencies]

requirements.development.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,3 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19-
20-
# Integration tests use PyPDF to check the contents of the printed PDF.
21-
pypdf==3.9.0

0 commit comments

Comments
 (0)