diff --git a/docs/man.md b/docs/man.md index 132d896..db7d600 100644 --- a/docs/man.md +++ b/docs/man.md @@ -71,6 +71,18 @@ reMarkable options: If the target directory does not exist it will be created. If not specified, the root directory will be used. +Output customization: + +--css=FILENAME + Path to a CSS file with custom styling for the HTML output. This option + is ignored for any of the other providers. The code for the HTML + provider contains the default CSS style, which can be used as a starting + point. + +--font-urls=FILENAME + Path to a file with font urls (one per line) for the HTML output. This + will generally be used in combination with the ``--css`` option. + System settings: You'll only need to specify these options if the programs are not available on diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index fb9d8a3..8240a62 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -9,8 +9,8 @@ """ # NOTE: I know about the logging module, but this was easier because one of the -# dependencies was using that and it became complicated. This one is obviously -# not thread-safe and is very simple. +# dependencies was using that and it interfered with our logging. The logger +# class below is obviously not thread-safe and is very simple. import datetime import sys diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index a664f23..56ffa31 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -49,6 +49,8 @@ def __init__( pdftk_path="pdftk", qpdf_path="qpdf", gs_path="gs", + css_path=None, + font_urls_path=None, cookiejar=None, ): self.upload = upload @@ -60,9 +62,12 @@ def __init__( self.pdftk_path = pdftk_path self.qpdf_path = qpdf_path self.gs_path = gs_path - self.informer = Informer() + self.css_path = css_path + self.font_urls_path = font_urls_path self.cookiejar = cookiejar + self.informer = Informer() + self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path) # wait time to not hit the server too frequently diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 3e32539..48ede10 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -13,13 +13,13 @@ import html2text import markdown +import os import re import readability import titlecase import unidecode import urllib import weasyprint -import weasyprint.fonts from ._base import Provider from ._info import Informer @@ -34,7 +34,6 @@ logger = Logger() CSS = """ -@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap'); @page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } @@ -48,6 +47,13 @@ code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; } """ +# NOTE: For some reason, Weasyprint no longer accepts the @import statement in +# the CSS to load the fonts. This may have to do with recent changes they've +# introduced. Providing the font urls separately does seem to work. +FONT_URLS = [ + "https://fonts.googleapis.com/css2?family=EB+Garamond&family=Noto+Serif&family=Inconsolata" +] + def url_fetcher(url): if url.startswith("//"): @@ -168,6 +174,30 @@ def preprocess_html(self, pdf_url, title, article): html_article = md.convert(article) return html_article + def get_css(self): + if self.css_path is None: + return CSS + if not os.path.exists(self.css_path): + logger.warning( + f"CSS file {self.css_path} doesn't exist, using default style." + ) + return CSS + with open(self.css_path, "r") as fp: + css = fp.read() + return css + + def get_font_urls(self): + if self.font_urls_path is None: + return FONT_URLS + if not os.path.exists(self.font_urls_path): + logger.warning( + f"Font urls file {self.font_urls_path} doesn't exist, using default." + ) + return FONT_URLS + with open(self.font_urls_path, "r") as fp: + font_urls = [l.strip() for l in fp.read().split("\n")] + return font_urls + def retrieve_pdf(self, pdf_url, filename): """Turn the HTML article in a clean pdf file @@ -193,11 +223,11 @@ def retrieve_pdf(self, pdf_url, filename): with open("./paper.html", "w") as fp: fp.write(html_article) - font_config = weasyprint.fonts.FontConfiguration() html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher) - css = weasyprint.CSS(string=CSS, font_config=font_config) - - html.write_pdf(filename, stylesheets=[css], font_config=font_config) + css = self.get_css() + font_urls = self.get_font_urls() + style = weasyprint.CSS(string=css) + html.write_pdf(filename, stylesheets=[style] + font_urls) def validate(src): # first check if it is a valid url diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index f9af28f..095b69a 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -107,6 +107,14 @@ def parse_args(): help="path to rmapi executable (default: rmapi)", default="rmapi", ) + parser.add_argument( + "--css", help="path to custom CSS file for HTML output", default=None + ) + parser.add_argument( + "--font-urls", + help="path to custom font urls file for HTML output", + default=None, + ) parser.add_argument( "input", help="One or more URLs to a paper or paths to local PDF files", @@ -229,6 +237,8 @@ def main(): pdftk_path=args.pdftk, qpdf_path=args.qpdf, gs_path=args.gs, + css_path=args.css, + font_urls_path=args.font_urls, cookiejar=cookiejar, ) prov.run(new_input, filename=filename) diff --git a/tests/test_html.py b/tests/test_html.py index d271bb5..7d5c92b 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -7,6 +7,9 @@ """ +import os +import pdfplumber +import tempfile import unittest from paper2remarkable.providers.html import HTML @@ -24,6 +27,38 @@ def test_experimental_fix_lazy_loading(self): expected_image = "https://www.seriouseats.com/images/2015/01/20150118-tea-max-falkowitz-3.jpg" self.assertIn(expected_image, html_article) + def test_custom_css(self): + test_css = """ + @page { size: 702px 936px; margin: 1in; } + img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } + h1,h2,h3 { font-family: 'Montserrat'; } + p, li { font-size: 12pt; line-height: 2; font-family: 'Montserrat'; text-align: left; } + """ + + test_font_urls = [ + "https://fonts.googleapis.com/css2?family=Montserrat&display=swap" + ] + + tmpfd, tempfname_css = tempfile.mkstemp(prefix="p2r_", suffix=".css") + with os.fdopen(tmpfd, "w") as fp: + fp.write(test_css) + + tmpfd, tempfname_urls = tempfile.mkstemp(prefix="p2r_", suffix=".txt") + with os.fdopen(tmpfd, "w") as fp: + fp.write("\n".join(test_font_urls)) + + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + prov = HTML( + upload=False, css_path=tempfname_css, font_urls_path=tempfname_urls + ) + filename = prov.run(url) + with pdfplumber.open(filename) as pdf: + self.assertEqual(8, len(pdf.pages)) + + os.unlink(tempfname_css) + os.unlink(tempfname_urls) + os.unlink(filename) + if __name__ == "__main__": unittest.main()