Skip to content

Commit

Permalink
HTML2PDF: System chromedriver for PDF export
Browse files Browse the repository at this point in the history
PDF export requires chrome/chromedriver. Currently StrictDoc always uses
webdriver_manager to download a suitable chromedriver and install it in a
strictdoc cache subdirectory.

There may be reasons to prefer a system installation over an ad hoc
download (e.g. security policy). Notably, Debian provides packages that
work out-of-the-box for StrictDoc. GitHub Ubuntu CI images have the
upstream version preinstalled.

This adds a CLI option --chromedriver to select an explicit
chromedriver. If not given, strictdoc uses webdriver_manager as usual.

To use the Debian package, one would call

 apt install chromium chromium-driver
 strictdoc export --formats=html2pdf --chromedriver=/usr/bin/chromedriver .

To use chromedriver from GitHub Ubuntu image, one would call

 strictdoc export --formats=html2pdf --chromedriver=$CHROMEWEBDRIVER .
  • Loading branch information
haxtibal committed Aug 7, 2024
1 parent c639d25 commit 8b651d8
Show file tree
Hide file tree
Showing 16 changed files with 116 additions and 24 deletions.
5 changes: 4 additions & 1 deletion docs/strictdoc_01_user_guide.sdoc
Original file line number Diff line number Diff line change
Expand Up @@ -2642,7 +2642,10 @@ There are three methods of PDF printing available:

3. Also in the web interface, by navigating to a 'PDF' view of a document and using the browser's built-in Print function.

The first two methods require the Chrome browser to be installed on the user's computer.
The first two methods require the Chrome browser and chromedriver to be installed on the user's computer.
StrictDoc downloads chromedriver on demand by default, or uses a preinstalled executable if
``strictdoc export --chromedriver=/path/to/chromedriver`` or the equivalent ``strictdoc.toml`` option
is given.

The third method, the PDF screen, presents a version of the document that is optimized for browser printing. This approach allows for the creation of neatly formatted PDF documents or directly printed documents. Although this method is compatible with any browser, Chrome is recommended for the best printing results. Unlike Firefox and Safari, Chrome maintains the document's internal hyperlinks in the printed PDF.

Expand Down
3 changes: 3 additions & 0 deletions strictdoc/cli/cli_arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def __init__(
reqif_multiline_is_xhtml: bool,
reqif_enable_mid: bool,
view: Optional[str],
chromedriver: Optional[str],
):
assert isinstance(input_paths, list), f"{input_paths}"
self.input_paths: List[str] = input_paths
Expand All @@ -165,6 +166,7 @@ def __init__(
self.reqif_enable_mid: bool = reqif_enable_mid
self.view: Optional[str] = view
self.output_html_root: str = os.path.join(output_dir, "html")
self.chromedriver: Optional[str] = chromedriver

def get_path_to_config(self) -> str:
# FIXME: The control flow can be improved.
Expand Down Expand Up @@ -298,6 +300,7 @@ def get_export_config(self) -> ExportCommandConfig:
self.args.reqif_multiline_is_xhtml,
self.args.reqif_enable_mid,
self.args.view,
self.args.chromedriver,
)

def get_import_config_reqif(self, _) -> ImportReqIFCommandConfig:
Expand Down
7 changes: 7 additions & 0 deletions strictdoc/cli/command_parser_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ def add_export_command(parent_command_parser):
type=str,
help="Choose which view will be exported.",
)
# Optional path to an already installed chromedriver for the html2pdf export.
# The help text is built from adjacent string literals, so each fragment that
# is followed by another must end with a space to keep the words separated.
command_parser_export.add_argument(
    "--chromedriver",
    type=str,
    help=(
        "Path to pre installed chromedriver for html2pdf. "
        "If not given, chromedriver is downloaded and saved to "
        "strictdoc cache."
    ),
)
add_config_argument(command_parser_export)

@staticmethod
Expand Down
13 changes: 13 additions & 0 deletions strictdoc/core/project_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def __init__(
reqif_enable_mid: bool,
reqif_import_markup: Optional[str],
config_last_update: Optional[datetime.datetime],
chromedriver: Optional[str],
):
assert isinstance(environment, SDocRuntimeEnvironment)
if source_root_path is not None:
Expand Down Expand Up @@ -149,6 +150,7 @@ def __init__(
)
self.is_running_on_server: bool = False
self.view: Optional[str] = None
self.chromedriver: Optional[str] = chromedriver

@staticmethod
def default_config(environment: SDocRuntimeEnvironment):
Expand All @@ -172,6 +174,7 @@ def default_config(environment: SDocRuntimeEnvironment):
reqif_enable_mid=False,
reqif_import_markup=None,
config_last_update=None,
chromedriver=None,
)

# Some server command settings can override the project config settings.
Expand All @@ -194,6 +197,7 @@ def integrate_export_config(self, export_config: ExportCommandConfig):
self.filter_sections = export_config.filter_sections
self.excel_export_fields = export_config.fields
self.view = export_config.view
self.chromedriver = export_config.chromedriver
if self.source_root_path is None:
source_root_path = export_config.input_paths[0]
if not os.path.abspath(source_root_path):
Expand Down Expand Up @@ -368,6 +372,7 @@ def _load_from_dictionary(
reqif_multiline_is_xhtml = False
reqif_enable_mid = False
reqif_import_markup: Optional[str] = None
chromedriver: Optional[str] = None

if "project" in config_dict:
project_content = config_dict["project"]
Expand Down Expand Up @@ -507,6 +512,13 @@ def _load_from_dictionary(
assert relation_tuple is not None
traceability_matrix_relation_columns.append(relation_tuple)

chromedriver = project_content.get("chromedriver", chromedriver)
if chromedriver is not None and not os.path.isfile(chromedriver):
print( # noqa: T201
f"warning: strictdoc.toml: chromedriver {chromedriver} "
"not found."
)

if "server" in config_dict:
# FIXME: Introduce at least a basic validation for the host/port.
server_content = config_dict["server"]
Expand Down Expand Up @@ -554,4 +566,5 @@ def _load_from_dictionary(
reqif_enable_mid=reqif_enable_mid,
reqif_import_markup=reqif_import_markup,
config_last_update=config_last_update,
chromedriver=chromedriver,
)
29 changes: 18 additions & 11 deletions strictdoc/export/html2pdf/html2pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,20 @@ def get_pdf_from_html(driver, url) -> bytes:
return data


def create_webdriver():
def create_webdriver(chromedriver: Optional[str]):
print("HTML2PDF: creating Chrome Driver service.", flush=True) # noqa: T201
if chromedriver is None:
cache_manager = HTML2PDF_CacheManager(
file_manager=FileManager(os_system_manager=OperationSystemManager())
)

cache_manager = HTML2PDF_CacheManager(
file_manager=FileManager(os_system_manager=OperationSystemManager())
)

http_client = HTML2PDF_HTTPClient()
download_manager = WDMDownloadManager(http_client)
path_to_chrome = ChromeDriverManager(
download_manager=download_manager, cache_manager=cache_manager
).install()
http_client = HTML2PDF_HTTPClient()
download_manager = WDMDownloadManager(http_client)
path_to_chrome = ChromeDriverManager(
download_manager=download_manager, cache_manager=cache_manager
).install()
else:
path_to_chrome = chromedriver
print(f"HTML2PDF: Chrome Driver available at path: {path_to_chrome}") # noqa: T201

service = Service(path_to_chrome)
Expand Down Expand Up @@ -211,14 +213,19 @@ def main():
os.environ["WDM_LOCAL"] = "1"

parser = argparse.ArgumentParser(description="HTML2PDF printer script.")
parser.add_argument(
"--chromedriver",
type=str,
help="Optional chromedriver path. Downloaded if not given.",
)
parser.add_argument("paths", help="Paths to input HTML file.")
args = parser.parse_args()

paths = args.paths

separate_path_pairs = paths.split(";")

driver = create_webdriver()
driver = create_webdriver(args.chromedriver)

@atexit.register
def exit_handler():
Expand Down
5 changes: 4 additions & 1 deletion strictdoc/export/html2pdf/html2pdf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ def export_tree(
)
pdf_print_driver = PDFPrintDriver()
try:
pdf_print_driver.get_pdf_from_html(paths_to_print_argument)
pdf_print_driver.get_pdf_from_html(
project_config,
paths_to_print_argument,
)
except TimeoutError:
print("error: HTML2PDF: timeout error.") # noqa: T201
25 changes: 16 additions & 9 deletions strictdoc/export/html2pdf/pdf_print_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,33 @@
from subprocess import CompletedProcess, TimeoutExpired, run

from strictdoc import environment
from strictdoc.core.project_config import ProjectConfig
from strictdoc.helpers.timing import measure_performance


class PDFPrintDriver:
@staticmethod
def get_pdf_from_html(paths_to_print: str):
def get_pdf_from_html(
project_config: ProjectConfig,
paths_to_print: str,
):
assert isinstance(paths_to_print, str)
cmd = [
# Using sys.executable instead of "python" is important because
# venv subprocess call to python resolves to wrong interpreter,
# https://github.com/python/cpython/issues/86207
sys.executable,
environment.get_path_to_html2pdf(),
paths_to_print,
]
if project_config.chromedriver is not None:
cmd.extend(["--chromedriver", project_config.chromedriver])
with measure_performance(
"PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
):
try:
_: CompletedProcess = run(
[
# Using sys.executable instead of "python" is important because
# venv subprocess call to python resolves to wrong interpreter,
# https://github.com/python/cpython/issues/86207
sys.executable,
environment.get_path_to_html2pdf(),
paths_to_print,
],
cmd,
capture_output=False,
check=False,
)
Expand Down
4 changes: 3 additions & 1 deletion strictdoc/server/routers/main_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def create_main_router(
reqif_multiline_is_xhtml=False,
reqif_enable_mid=False,
view=None,
chromedriver=None,
)
project_config.integrate_export_config(_export_config)
project_config.is_running_on_server = True
Expand Down Expand Up @@ -2589,7 +2590,8 @@ def get_export_html2pdf(document_mid: str): # noqa: ARG001

try:
pdf_print_driver.get_pdf_from_html(
f"{path_to_output_html},{path_to_output_pdf}"
project_config,
f"{path_to_output_html},{path_to_output_pdf}",
)
except TimeoutError:
return Response(
Expand Down
7 changes: 7 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,16 @@ def test_integration(
if not html2pdf:
parallelize_opts = "" if not no_parallelization else "--threads 1"
html2pdf_param = ""
chromedriver_param = ""
test_folder = f"{cwd}/tests/integration"
else:
parallelize_opts = "--threads 1"
html2pdf_param = "--param TEST_HTML2PDF=1"
chromedriver_path = os.environ.get("CHROMEWEBDRIVER")
assert (
chromedriver_path is not None
), "TEST_HTML2PDF expects path to chromedriver in environment variable CHROMEWEBDRIVER"
chromedriver_param = f"--param CHROMEDRIVER={os.path.join(chromedriver_path, 'chromedriver')}"
test_folder = f"{cwd}/tests/integration/features/html2pdf"

strictdoc_cache_dir = os.path.join(tempfile.gettempdir(), "strictdoc_cache")
Expand All @@ -345,6 +351,7 @@ def test_integration(
--param STRICTDOC_EXEC="{strictdoc_exec}"
--param STRICTDOC_CACHE_DIR="{strictdoc_cache_dir}"
{html2pdf_param}
{chromedriver_param}
-v
{debug_opts}
{focus_or_none}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[DOCUMENT]
TITLE: Dummy Software Requirements Specification #1

[FREETEXT]
Hello world! 😊😊😊
[/FREETEXT]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[project]

features = [
"HTML2PDF",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
REQUIRES: TEST_HTML2PDF

# FIXME: Getting timeouts on Windows CI all the time. Needs to be checked or tested by users.
REQUIRES: PLATFORM_IS_NOT_WINDOWS

# GitHub images provide a chromedriver and export installed location, see
# https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#browsers-and-drivers
RUN: STRICTDOC_CACHE_DIR=%strictdoc_cache_dir %strictdoc export %S --formats=html2pdf --chromedriver=%chromedriver --output-dir Output | filecheck %s --dump-input=fail
CHECK: HTML2PDF: JS logs from the print session
CHECK-NOT: HTML2PDF: Chrome Driver available at path: {{.*}}strictdoc_cache{{.*}}

RUN: %check_exists --file %S/Output/html2pdf/pdf/input.pdf

RUN: %check_exists --file %S/Output/html2pdf/html/06_system_chromedriver/input.html

RUN: python %S/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pypdf import PdfReader

# Open the PDF at the relative path where the accompanying lit test expects
# the html2pdf export to have written it.
reader = PdfReader("Output/html2pdf/pdf/input.pdf")

# The exported document is expected to span exactly three pages; the page
# list is attached to the assertion message to ease debugging on failure.
page_count = len(reader.pages)
assert page_count == 3, reader.pages

# page2_text = reader.pages[1].extract_text() # noqa: ERA001
# assert "Table of contents" not in page2_text # noqa: ERA001
3 changes: 3 additions & 0 deletions tests/integration/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,8 @@ if not lit_config.isWindows:
config.available_features.add('PLATFORM_IS_NOT_WINDOWS')

if "TEST_HTML2PDF" in lit_config.params:
chromedriver = lit_config.params['CHROMEDRIVER']
assert(chromedriver)
config.available_features.add('TEST_HTML2PDF')
config.substitutions.append(('%chromedriver', chromedriver))
config.name = "StrictDoc HTML2PDF integration tests"
2 changes: 1 addition & 1 deletion tests/unit/strictdoc/cli/test_cli_arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
FAKE_STRICTDOC_ROOT_PATH = "/tmp/strictdoc-123"


TOTAL_EXPORT_ARGS = 17
TOTAL_EXPORT_ARGS = 18


def cli_args_parser():
Expand Down
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ skip_install = true
deps =
-rrequirements.bootstrap.txt
-rrequirements.check.txt
pass_env=
CHROMEWEBDRIVER
commands =
python developer/pip_install_strictdoc_deps.py
{posargs}
Expand Down

0 comments on commit 8b651d8

Please sign in to comment.