From 8b651d807cc1c66caaf29097b42762d6b24567a8 Mon Sep 17 00:00:00 2001 From: Tobias Deiminger Date: Sun, 4 Aug 2024 10:48:44 +0200 Subject: [PATCH] HTML2PDF: System chromedriver for PDF export PDF export requires chrome/chromedriver. Currently StrictDoc always uses webdriver_manager to download a suitable chromedriver and install it in a strictdoc cache subdirectory. There may be reasons to prefer a system installation over an ad hoc download (e.g. security policy). Notably Debian provides packages that work out-of-the-box for StrictDoc. GitHub Ubuntu CI images have the upstream version preinstalled. This adds a CLI option --chromedriver to select an explicit chromedriver. If not given, strictdoc uses webdriver_manager as usual. To use the Debian package, one would call apt install chromium chromium-driver strictdoc export --formats=html2pdf --chromedriver=/usr/bin/chromedriver . To use chromedriver from GitHub Ubuntu image, one would call strictdoc export --formats=html2pdf --chromedriver=$CHROMEWEBDRIVER/chromedriver . 
--- docs/strictdoc_01_user_guide.sdoc | 5 +++- strictdoc/cli/cli_arg_parser.py | 3 ++ strictdoc/cli/command_parser_builder.py | 7 +++++ strictdoc/core/project_config.py | 13 +++++++++ strictdoc/export/html2pdf/html2pdf.py | 29 ++++++++++++------- .../export/html2pdf/html2pdf_generator.py | 5 +++- strictdoc/export/html2pdf/pdf_print_driver.py | 25 ++++++++++------ strictdoc/server/routers/main_router.py | 4 ++- tasks.py | 7 +++++ .../06_system_chromedriver/input.sdoc | 6 ++++ .../06_system_chromedriver/strictdoc.toml | 5 ++++ .../06_system_chromedriver/test.itest | 16 ++++++++++ .../06_system_chromedriver/test_pdf.py | 8 +++++ tests/integration/lit.cfg | 3 ++ .../unit/strictdoc/cli/test_cli_arg_parser.py | 2 +- tox.ini | 2 ++ 16 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc create mode 100644 tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml create mode 100644 tests/integration/features/html2pdf/06_system_chromedriver/test.itest create mode 100644 tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py diff --git a/docs/strictdoc_01_user_guide.sdoc b/docs/strictdoc_01_user_guide.sdoc index bdbd129a9..0fa8fcd08 100644 --- a/docs/strictdoc_01_user_guide.sdoc +++ b/docs/strictdoc_01_user_guide.sdoc @@ -2642,7 +2642,10 @@ There are three methods of PDF printing available: 3. Also in the web interface, by navigating to a 'PDF' view of a document and using the browser's built-in Print function. -The first two methods require the Chrome browser to be installed on the user's computer. +The first two methods require the Chrome browser and chromedriver to be installed on the user's computer. +StrictDoc downloads chromedriver on demand by default, or uses a pre installed executable if +``strictdoc export --chromedriver=/path/to/chromedriver`` or the equivalent ``strictdoc.toml`` option +is given. 
The third method, the PDF screen, presents a version of the document that is optimized for browser printing. This approach allows for the creation of neatly formatted PDF documents or directly printed documents. Although this method is compatible with any browser, Chrome is recommended for the best printing results. Unlike Firefox and Safari, Chrome maintains the document's internal hyperlinks in the printed PDF. diff --git a/strictdoc/cli/cli_arg_parser.py b/strictdoc/cli/cli_arg_parser.py index ab7ed5b66..4c892ee61 100644 --- a/strictdoc/cli/cli_arg_parser.py +++ b/strictdoc/cli/cli_arg_parser.py @@ -146,6 +146,7 @@ def __init__( reqif_multiline_is_xhtml: bool, reqif_enable_mid: bool, view: Optional[str], + chromedriver: Optional[str], ): assert isinstance(input_paths, list), f"{input_paths}" self.input_paths: List[str] = input_paths @@ -165,6 +166,7 @@ def __init__( self.reqif_enable_mid: bool = reqif_enable_mid self.view: Optional[str] = view self.output_html_root: str = os.path.join(output_dir, "html") + self.chromedriver: Optional[str] = chromedriver def get_path_to_config(self) -> str: # FIXME: The control flow can be improved. @@ -298,6 +300,7 @@ def get_export_config(self) -> ExportCommandConfig: self.args.reqif_multiline_is_xhtml, self.args.reqif_enable_mid, self.args.view, + self.args.chromedriver, ) def get_import_config_reqif(self, _) -> ImportReqIFCommandConfig: diff --git a/strictdoc/cli/command_parser_builder.py b/strictdoc/cli/command_parser_builder.py index 3404e216e..e356f8cfc 100644 --- a/strictdoc/cli/command_parser_builder.py +++ b/strictdoc/cli/command_parser_builder.py @@ -243,6 +243,13 @@ def add_export_command(parent_command_parser): type=str, help="Choose which view will be exported.", ) + command_parser_export.add_argument( + "--chromedriver", + type=str, + help="Path to pre installed chromedriver for html2pdf. 
" + "If not given, chromedriver is downloaded and saved to" + "strictdoc cache.", + ) add_config_argument(command_parser_export) @staticmethod diff --git a/strictdoc/core/project_config.py b/strictdoc/core/project_config.py index c15ea933a..eccd80cea 100644 --- a/strictdoc/core/project_config.py +++ b/strictdoc/core/project_config.py @@ -95,6 +95,7 @@ def __init__( reqif_enable_mid: bool, reqif_import_markup: Optional[str], config_last_update: Optional[datetime.datetime], + chromedriver: Optional[str], ): assert isinstance(environment, SDocRuntimeEnvironment) if source_root_path is not None: @@ -149,6 +150,7 @@ def __init__( ) self.is_running_on_server: bool = False self.view: Optional[str] = None + self.chromedriver: Optional[str] = chromedriver @staticmethod def default_config(environment: SDocRuntimeEnvironment): @@ -172,6 +174,7 @@ def default_config(environment: SDocRuntimeEnvironment): reqif_enable_mid=False, reqif_import_markup=None, config_last_update=None, + chromedriver=None, ) # Some server command settings can override the project config settings. 
@@ -194,6 +197,7 @@ def integrate_export_config(self, export_config: ExportCommandConfig): self.filter_sections = export_config.filter_sections self.excel_export_fields = export_config.fields self.view = export_config.view + self.chromedriver = export_config.chromedriver if self.source_root_path is None: source_root_path = export_config.input_paths[0] if not os.path.abspath(source_root_path): @@ -368,6 +372,7 @@ def _load_from_dictionary( reqif_multiline_is_xhtml = False reqif_enable_mid = False reqif_import_markup: Optional[str] = None + chromedriver: Optional[str] = None if "project" in config_dict: project_content = config_dict["project"] @@ -507,6 +512,13 @@ def _load_from_dictionary( assert relation_tuple is not None traceability_matrix_relation_columns.append(relation_tuple) + chromedriver = project_content.get("chromedriver", chromedriver) + if chromedriver is not None and not os.path.isfile(chromedriver): + print( # noqa: T201 + f"warning: strictdoc.toml: chromedriver {chromedriver} " + "not found." + ) + if "server" in config_dict: # FIXME: Introduce at least a basic validation for the host/port. 
server_content = config_dict["server"] @@ -554,4 +566,5 @@ def _load_from_dictionary( reqif_enable_mid=reqif_enable_mid, reqif_import_markup=reqif_import_markup, config_last_update=config_last_update, + chromedriver=chromedriver, ) diff --git a/strictdoc/export/html2pdf/html2pdf.py b/strictdoc/export/html2pdf/html2pdf.py index af3d02ae2..92670228e 100644 --- a/strictdoc/export/html2pdf/html2pdf.py +++ b/strictdoc/export/html2pdf/html2pdf.py @@ -164,18 +164,20 @@ def get_pdf_from_html(driver, url) -> bytes: return data -def create_webdriver(): +def create_webdriver(chromedriver: Optional[str]): print("HTML2PDF: creating Chrome Driver service.", flush=True) # noqa: T201 + if chromedriver is None: + cache_manager = HTML2PDF_CacheManager( + file_manager=FileManager(os_system_manager=OperationSystemManager()) + ) - cache_manager = HTML2PDF_CacheManager( - file_manager=FileManager(os_system_manager=OperationSystemManager()) - ) - - http_client = HTML2PDF_HTTPClient() - download_manager = WDMDownloadManager(http_client) - path_to_chrome = ChromeDriverManager( - download_manager=download_manager, cache_manager=cache_manager - ).install() + http_client = HTML2PDF_HTTPClient() + download_manager = WDMDownloadManager(http_client) + path_to_chrome = ChromeDriverManager( + download_manager=download_manager, cache_manager=cache_manager + ).install() + else: + path_to_chrome = chromedriver print(f"HTML2PDF: Chrome Driver available at path: {path_to_chrome}") # noqa: T201 service = Service(path_to_chrome) @@ -211,6 +213,11 @@ def main(): os.environ["WDM_LOCAL"] = "1" parser = argparse.ArgumentParser(description="HTML2PDF printer script.") + parser.add_argument( + "--chromedriver", + type=str, + help="Optional chromedriver path. 
Downloaded if not given.", + ) parser.add_argument("paths", help="Paths to input HTML file.") args = parser.parse_args() @@ -218,7 +225,7 @@ def main(): separate_path_pairs = paths.split(";") - driver = create_webdriver() + driver = create_webdriver(args.chromedriver) @atexit.register def exit_handler(): diff --git a/strictdoc/export/html2pdf/html2pdf_generator.py b/strictdoc/export/html2pdf/html2pdf_generator.py index b52dd380f..e7fa493f4 100644 --- a/strictdoc/export/html2pdf/html2pdf_generator.py +++ b/strictdoc/export/html2pdf/html2pdf_generator.py @@ -106,6 +106,9 @@ def export_tree( ) pdf_print_driver = PDFPrintDriver() try: - pdf_print_driver.get_pdf_from_html(paths_to_print_argument) + pdf_print_driver.get_pdf_from_html( + project_config, + paths_to_print_argument, + ) except TimeoutError: print("error: HTML2PDF: timeout error.") # noqa: T201 diff --git a/strictdoc/export/html2pdf/pdf_print_driver.py b/strictdoc/export/html2pdf/pdf_print_driver.py index 2961291be..0e0897eb5 100644 --- a/strictdoc/export/html2pdf/pdf_print_driver.py +++ b/strictdoc/export/html2pdf/pdf_print_driver.py @@ -3,26 +3,33 @@ from subprocess import CompletedProcess, TimeoutExpired, run from strictdoc import environment +from strictdoc.core.project_config import ProjectConfig from strictdoc.helpers.timing import measure_performance class PDFPrintDriver: @staticmethod - def get_pdf_from_html(paths_to_print: str): + def get_pdf_from_html( + project_config: ProjectConfig, + paths_to_print: str, + ): assert isinstance(paths_to_print, str) + cmd = [ + # Using sys.executable instead of "python" is important because + # venv subprocess call to python resolves to wrong interpreter, + # https://github.com/python/cpython/issues/86207 + sys.executable, + environment.get_path_to_html2pdf(), + paths_to_print, + ] + if project_config.chromedriver is not None: + cmd.extend(["--chromedriver", project_config.chromedriver]) with measure_performance( "PDFPrintDriver: printing HTML to PDF using HTML2PDF 
and Chrome Driver" ): try: _: CompletedProcess = run( - [ - # Using sys.executable instead of "python" is important because - # venv subprocess call to python resolves to wrong interpreter, - # https://github.com/python/cpython/issues/86207 - sys.executable, - environment.get_path_to_html2pdf(), - paths_to_print, - ], + cmd, capture_output=False, check=False, ) diff --git a/strictdoc/server/routers/main_router.py b/strictdoc/server/routers/main_router.py index d0523cead..4a3a028a4 100644 --- a/strictdoc/server/routers/main_router.py +++ b/strictdoc/server/routers/main_router.py @@ -157,6 +157,7 @@ def create_main_router( reqif_multiline_is_xhtml=False, reqif_enable_mid=False, view=None, + chromedriver=None, ) project_config.integrate_export_config(_export_config) project_config.is_running_on_server = True @@ -2589,7 +2590,8 @@ def get_export_html2pdf(document_mid: str): # noqa: ARG001 try: pdf_print_driver.get_pdf_from_html( - f"{path_to_output_html},{path_to_output_pdf}" + project_config, + f"{path_to_output_html},{path_to_output_pdf}", ) except TimeoutError: return Response( diff --git a/tasks.py b/tasks.py index cbfd0a6ce..a74866326 100644 --- a/tasks.py +++ b/tasks.py @@ -332,10 +332,16 @@ def test_integration( if not html2pdf: parallelize_opts = "" if not no_parallelization else "--threads 1" html2pdf_param = "" + chromedriver_param = "" test_folder = f"{cwd}/tests/integration" else: parallelize_opts = "--threads 1" html2pdf_param = "--param TEST_HTML2PDF=1" + chromedriver_path = os.environ.get("CHROMEWEBDRIVER") + assert ( + chromedriver_path is not None + ), "TEST_HTML2PDF expects path to chromedriver in environment variable CHROMEWEBDRIVER" + chromedriver_param = f"--param CHROMEDRIVER={os.path.join(chromedriver_path, 'chromedriver')}" test_folder = f"{cwd}/tests/integration/features/html2pdf" strictdoc_cache_dir = os.path.join(tempfile.gettempdir(), "strictdoc_cache") @@ -345,6 +351,7 @@ def test_integration( --param STRICTDOC_EXEC="{strictdoc_exec}" 
--param STRICTDOC_CACHE_DIR="{strictdoc_cache_dir}" {html2pdf_param} + {chromedriver_param} -v {debug_opts} {focus_or_none} diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc b/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc new file mode 100644 index 000000000..ef190c2c8 --- /dev/null +++ b/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc @@ -0,0 +1,6 @@ +[DOCUMENT] +TITLE: Dummy Software Requirements Specification #1 + +[FREETEXT] +Hello world! 😊😊😊 +[/FREETEXT] diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml b/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml new file mode 100644 index 000000000..89820bf5e --- /dev/null +++ b/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml @@ -0,0 +1,5 @@ +[project] + +features = [ + "HTML2PDF", +] diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test.itest b/tests/integration/features/html2pdf/06_system_chromedriver/test.itest new file mode 100644 index 000000000..fd2a5ac9e --- /dev/null +++ b/tests/integration/features/html2pdf/06_system_chromedriver/test.itest @@ -0,0 +1,16 @@ +REQUIRES: TEST_HTML2PDF + +# FIXME: Getting timeouts on Windows CI all the time. Needs to be checked or tested by users. 
+REQUIRES: PLATFORM_IS_NOT_WINDOWS + +# GitHub images provide a chromedriver and export installed location, see +# https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#browsers-and-drivers +RUN: STRICTDOC_CACHE_DIR=%strictdoc_cache_dir %strictdoc export %S --formats=html2pdf --chromedriver=%chromedriver --output-dir Output | filecheck %s --dump-input=fail +CHECK: HTML2PDF: JS logs from the print session +CHECK-NOT: HTML2PDF: Chrome Driver available at path: {{.*}}strictdoc_cache{{.*}} + +RUN: %check_exists --file %S/Output/html2pdf/pdf/input.pdf + +RUN: %check_exists --file %S/Output/html2pdf/html/06_system_chromedriver/input.html + +RUN: python %S/test_pdf.py diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py b/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py new file mode 100644 index 000000000..a3741bbee --- /dev/null +++ b/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py @@ -0,0 +1,8 @@ +from pypdf import PdfReader + +reader = PdfReader("Output/html2pdf/pdf/input.pdf") + +assert len(reader.pages) == 3, reader.pages + +# page2_text = reader.pages[1].extract_text() # noqa: ERA001 +# assert "Table of contents" not in page2_text # noqa: ERA001 diff --git a/tests/integration/lit.cfg b/tests/integration/lit.cfg index 0f168df5b..b029756eb 100644 --- a/tests/integration/lit.cfg +++ b/tests/integration/lit.cfg @@ -41,5 +41,8 @@ if not lit_config.isWindows: config.available_features.add('PLATFORM_IS_NOT_WINDOWS') if "TEST_HTML2PDF" in lit_config.params: + chromedriver = lit_config.params['CHROMEDRIVER'] + assert(chromedriver) config.available_features.add('TEST_HTML2PDF') + config.substitutions.append(('%chromedriver', chromedriver)) config.name = "StrictDoc HTML2PDF integration tests" diff --git a/tests/unit/strictdoc/cli/test_cli_arg_parser.py b/tests/unit/strictdoc/cli/test_cli_arg_parser.py index f744ca46b..4eb5b6d3b 100644 --- 
a/tests/unit/strictdoc/cli/test_cli_arg_parser.py +++ b/tests/unit/strictdoc/cli/test_cli_arg_parser.py @@ -8,7 +8,7 @@ FAKE_STRICTDOC_ROOT_PATH = "/tmp/strictdoc-123" -TOTAL_EXPORT_ARGS = 17 +TOTAL_EXPORT_ARGS = 18 def cli_args_parser(): diff --git a/tox.ini b/tox.ini index 9917df7d1..2e5b98ee1 100644 --- a/tox.ini +++ b/tox.ini @@ -27,6 +27,8 @@ skip_install = true deps = -rrequirements.bootstrap.txt -rrequirements.check.txt +pass_env= + CHROMEWEBDRIVER commands = python developer/pip_install_strictdoc_deps.py {posargs}