1010from datetime import datetime
1111from pathlib import Path
1212from time import sleep
13- from typing import Dict , List , Optional
13+ from typing import Dict , List , Optional , Tuple
1414
1515import requests
16+ from pypdf import PdfReader
1617from requests import Response
1718from selenium import webdriver
1819from selenium .webdriver .chrome .options import Options
3940sys .stdout = open (sys .stdout .fileno (), mode = "w" , encoding = "utf8" , closefd = False )
4041
4142
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """
    Return the page count that HTML2PDF4Doc reported in the browser logs.

    Scans the "message" field of each log entry in order and returns the
    integer captured from the first entry that contains the
    '"[HTML2PDF4DOC] Page count:" <N>' marker.

    :param logs: browser log entries; each entry must have a "message" key.
    :return: the first page count found, as an int.
    :raises ValueError: if no entry contains the page count marker.
    """
    # The double quotes are part of the pattern on purpose: the marker is
    # matched as a quoted string followed by the unquoted number.
    pattern = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')
    for entry_ in logs:
        # No per-entry printing here: this helper only parses the logs and
        # must not write them to stdout.
        match = pattern.search(entry_["message"])
        if match:
            return int(match.group(1))
    raise ValueError("No page count found in logs.")
52+
53+
4254class ChromeDriverManager :
4355 def get_chrome_driver (self , path_to_cache_dir : str ) -> str :
4456 chrome_version : Optional [str ] = self .get_chrome_version ()
@@ -253,7 +265,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253265 return mm / 25.4
254266
255267
256- def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> bytes :
268+ def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> Tuple [ bytes , int ] :
257269 print (f"html2pdf4doc: opening URL with ChromeDriver: { url } " ) # noqa: T201
258270
259271 driver .get (url )
@@ -285,21 +297,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285297 }
286298
287299 class Done (Exception ):
288- pass
300+ def __init__ (self , page_count : int ):
301+ super ().__init__ ()
302+ self .page_count : int = page_count
289303
290304 datetime_start = datetime .today ()
291305
292306 logs : List [Dict [str , str ]] = []
307+ page_count : int = 0
293308 try :
294309 while True :
295310 logs = driver .get_log ("browser" ) # type: ignore[no-untyped-call]
296311 for entry_ in logs :
297312 if "[HTML2PDF4DOC] Total time:" in entry_ ["message" ]:
298313 print ("success: HTML2PDF4Doc completed its job." ) # noqa: T201
299- raise Done
314+
315+ page_count = extract_page_count (logs )
316+
317+ raise Done (page_count )
300318 if (datetime .today () - datetime_start ).total_seconds () > 60 :
301319 raise TimeoutError
302- sleep (0.5 )
320+ sleep (0.1 )
303321 except Done :
304322 pass
305323 except TimeoutError :
@@ -322,7 +340,13 @@ class Done(Exception):
322340 result = driver .execute_cdp_cmd ("Page.printToPDF" , calculated_print_options )
323341
324342 data = base64 .b64decode (result ["data" ])
325- return data
343+
344+ if page_count == 0 :
345+ raise RuntimeError (
346+ "html2pdf4doc: Something went wrong. "
347+ "Could not capture the printed page count from Chrome."
348+ )
349+ return data , page_count
326350
327351
328352def create_webdriver (
@@ -521,9 +545,21 @@ def exit_handler() -> None:
521545
522546 url = Path (os .path .abspath (path_to_input_html )).as_uri ()
523547
524- pdf_bytes = get_pdf_from_html (driver , url )
548+ pdf_bytes , page_count = get_pdf_from_html (driver , url )
525549 with open (path_to_output_pdf , "wb" ) as f :
526550 f .write (pdf_bytes )
551+
552+ reader = PdfReader (path_to_output_pdf )
553+ if len (reader .pages ) != page_count :
554+ raise RuntimeError (
555+ "Something went wrong with the printed page. "
556+ f"Page count mismatch: "
557+ f"PDF pages: { reader .pages } , "
558+ f"html2pdf4doc pages: { page_count } ."
559+ )
560+
561+ assert reader .pages [0 ].extract_text () == "Hello world!"
562+
527563 else :
528564 print ("html2pdf4doc: unknown command." ) # noqa: T201
529565 sys .exit (1 )
0 commit comments