diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6ecb0ce..602636b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: runs-on: [ 'ubuntu-latest' ] strategy: matrix: - py: [ '3.8', '3.11' ] # minimum required and latest stable + py: [ '3.9', '3.11' ] # minimum required and latest stable steps: - name: Install Python ${{ matrix.py }} @@ -32,7 +32,7 @@ jobs: - name: Run code quality tests (black) uses: psf/black@stable with: - version: "23.3.0" + version: "24.8.0" - name: Run code quality tests (isort) uses: jamescurtin/isort-action@master diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6513f58..479d639 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ # NOTE: Keep versions in sync with Github Actions test.yml repos: - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 24.8.0 hooks: - id: black language_version: python3 diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index efb411f..998ef3e 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -35,8 +35,9 @@ def __init__(self, provider, url, reason=None): self.reason = reason def __str__(self): - msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format( - provider=self.provider, url=self.url + msg = ( + f"ERROR: Couldn't figure out {self.provider} URLs from provided " + f"url: {self.url}" ) if self.reason: msg += "\nReason: {reason}".format(reason=self.reason) @@ -53,8 +54,9 @@ def __init__(self, provider, url, reason=None): self.reason = reason def __str__(self): - msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format( - provider=self.provider, url=self.url + msg = ( + f"ERROR: Couldn't determine a filename from {self.url} for " + f"provider {self.provider}" ) if self.reason: msg += "\nReason: {reason}".format(reason=self.reason) @@ -153,6 +155,7 @@ def __str__(self): ) return msg + class FulltextMissingError(Error): """Exception raised when the fulltext PDF can't be found.""" @@ -166,4 +169,4 @@ def __str__(self): f"\t{self.provider}\n" f"\t{self.url}\n" ) - return msg + return msg diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index 8240a62..63c9b24 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -41,7 +41,7 @@ def disable(self): def _log(self, msg, mode, end="\n", add_prefix=True): if not self.enabled: return - if not mode in ("info", "warn"): + if mode not in ("info", "warn"): raise ValueError("Unknown logging mode: %s" % mode) file = sys.stdout if mode == "info" else sys.stderr if add_prefix: diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index e0a67d2..70713a3 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -56,6 +56,13 @@ def blank_pdf(filepath): return output_file +def _filesize_string(size: int) -> str: + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024: + return f"{size:.2f} {unit}" + size /= 1024 + + def shrink_pdf(filepath, gs_path="gs"): """Shrink the PDF file size using Ghostscript""" logger.info("Shrinking pdf file ...") @@ -79,8 +86,14 @@ def shrink_pdf(filepath, gs_path="gs"): if not status == 0: logger.warning("Failed to shrink the pdf file") return filepath + size_after = os.path.getsize(output_file) if size_after > size_before: - logger.info("Shrinking has no effect for this file, using original.") + size_str = _filesize_string(size_before) + logger.info( + f"Shrinking has no effect for this file, using original ({size_str})." + ) return filepath + size_str = _filesize_string(size_after) + logger.info(f"Shrinking brought filesize down to {size_str}") return output_file diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 5afe0b6..04a4925 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -156,6 +156,8 @@ def rewrite_pdf(self, in_file, out_pdf=None): self.gs_path, "-sDEVICE=pdfwrite", "-dQUIET", + "-dWriteXRefStm=false", + "-dWriteObjStms=false", "-o", out_pdf, in_file, diff --git a/paper2remarkable/providers/acl.py b/paper2remarkable/providers/acl.py index 964819f..1232807 100644 --- a/paper2remarkable/providers/acl.py +++ b/paper2remarkable/providers/acl.py @@ -23,10 +23,10 @@ def _format_authors(self, soup_authors): class ACL(Provider): - re_abs_1 = "^https://www.aclweb.org/anthology/(?P[0-9a-zA-Z\.\-]+)" - re_abs_2 = "^https://(www.)?aclanthology.org/(?P[0-9a-zA-Z\.\-]+)" - re_pdf_1 = "^https://www.aclweb.org/anthology/(?P[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf" - re_pdf_2 = "^https://(www.)?aclanthology.org/(?P[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf" + re_abs_1 = r"^https://www.aclweb.org/anthology/(?P[0-9a-zA-Z\.\-]+)" + re_abs_2 = r"^https://(www.)?aclanthology.org/(?P[0-9a-zA-Z\.\-]+)" + re_pdf_1 = r"^https://www.aclweb.org/anthology/(?P[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf" + re_pdf_2 = r"^https://(www.)?aclanthology.org/(?P[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -59,6 +59,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("ACL", url) + @staticmethod def validate(src): return ( re.match(ACL.re_pdf_1, src) diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py index bbe64a4..c914746 100644 --- a/paper2remarkable/providers/acm.py +++ b/paper2remarkable/providers/acm.py @@ -22,20 +22,21 @@ class ACMInformer(Informer): meta_author_key = "citation_authors" def get_title(self, soup): - target = soup.find("h1", {"class": "citation__title"}) + target = soup.find("div", {"class": "core-publication-title"}) return target.text def get_authors(self, soup): authors = [ - a["title"] for a in soup.find_all("a", {"class": "author-name"}) + author_block.find("span", {"property": "familyName"}).text + for author_block in soup.find_all("span", {"property": "author"}) ] - return self._format_authors(authors) + return authors def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) def get_year(self, soup): - date = soup.find("span", {"class": "epub-section__date"}) + date = soup.find("span", {"class": "core-date-published"}) return self._format_year(date.text) def _format_year(self, soup_date): @@ -43,8 +44,8 @@ def _format_year(self, soup_date): class ACM(Provider): - re_abs = "^https?://dl.acm.org/doi/(?P\d+\.\d+/\d+\.\d+)" - re_pdf = "^https?://dl.acm.org/doi/pdf/(?P\d+\.\d+/\d+\.\d+)(\?download=true)?" + re_abs = r"^https?://dl.acm.org/doi/(?P\d+\.\d+/\d+\.\d+)" + re_pdf = r"^https?://dl.acm.org/doi/pdf/(?P\d+\.\d+/\d+\.\d+)(\?download=true)?" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -71,6 +72,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("ACM", url) return abs_url, pdf_url + @staticmethod def validate(src): m = re.match(ACM.re_abs, src) or re.match(ACM.re_pdf, src) - return not m is None + return m is not None diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index a7224ae..eabcbba 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -29,11 +29,11 @@ class ArxivInformer(Informer): class Arxiv(Provider): - re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" - re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + re_abs_1 = r"https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" + re_pdf_1 = r"https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?" - re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?" - re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf" + re_abs_2 = r"https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?" + re_pdf_2 = r"https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?(\.pdf)?" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -50,12 +50,15 @@ def get_abs_pdf_urls(self, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" elif re.match(self.re_pdf_1, url) or re.match(self.re_pdf_2, url): - abs_url = url[:-4].replace("pdf", "abs") + if url.endswith(".pdf"): + url = url[:-4] + abs_url = url.replace("pdf", "abs") pdf_url = url else: raise URLResolutionError("arXiv", url) return abs_url, pdf_url + @staticmethod def validate(src): """Check if the url is to an arXiv page.""" return ( diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py index 38b573a..e837578 100644 --- a/paper2remarkable/providers/citeseerx.py +++ b/paper2remarkable/providers/citeseerx.py @@ -9,7 +9,6 @@ """ import re -import time from ..exceptions import URLResolutionError from ..log import Logger @@ -24,13 +23,14 @@ class CiteSeerXInformer(Informer): meta_date_key = "citation_year" def _format_authors(self, soup_authors): - op = lambda x: x[0].split(",") - return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + return super()._format_authors( + soup_authors, sep=" ", idx=-1, op=lambda x: x[0].split(",") + ) class CiteSeerX(Provider): - re_abs = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P[0-9\.]+)" - re_pdf = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P[0-9\.]+)&rep=rep1&type=pdf" + re_abs = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P[0-9\.]+)" + re_pdf = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P[0-9\.]+)&rep=rep1&type=pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -72,6 +72,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("CiteSeerX", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(CiteSeerX.re_abs, src) or re.match( CiteSeerX.re_pdf, src diff --git a/paper2remarkable/providers/cvf.py b/paper2remarkable/providers/cvf.py index 8ebf24d..b43d845 100644 --- a/paper2remarkable/providers/cvf.py +++ b/paper2remarkable/providers/cvf.py @@ -23,8 +23,8 @@ class CVFInformer(Informer): class CVF(Provider): - re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$" - re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$" + re_abs = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$" + re_pdf = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -43,6 +43,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("CVF", url) return abs_url, pdf_url + @staticmethod def validate(src): m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src) - return not m is None + return m is not None diff --git a/paper2remarkable/providers/diva.py b/paper2remarkable/providers/diva.py index f8c5879..b18eaa1 100644 --- a/paper2remarkable/providers/diva.py +++ b/paper2remarkable/providers/diva.py @@ -8,13 +8,12 @@ """ -import os import re -import urllib.parse import bs4 -from ..exceptions import URLResolutionError, FulltextMissingError +from ..exceptions import FulltextMissingError +from ..exceptions import URLResolutionError from ..log import Logger from ..utils import get_page_with_retry from ._base import Provider @@ -25,18 +24,23 @@ class DiVAInformer(Informer): def get_year(self, soup): - year = soup.find("meta", {"name": "citation_publication_date"}).get("content") + year = soup.find("meta", {"name": "citation_publication_date"}).get( + "content" + ) if not year: logger.warning( - "Couldn't determine year information, maybe provide the desired filename using '--filename'?" + "Couldn't determine year information, maybe provide the " + "desired filename using '--filename'?" ) return "" return year class DiVA(Provider): - re_abs = "^https?://[a-z]+.diva-portal.org/smash/record.jsf" - re_pdf = "^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT" + re_abs = r"^https?://[a-z]+.diva-portal.org/smash/record.jsf" + re_pdf = ( + r"^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT" + ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -48,16 +52,16 @@ def _get_doc_url(self, abs_url): pdf_url = soup.find("meta", {"name": "citation_pdf_url"}) if pdf_url is None: - logger.warning( - "Couldn't find the fulltext URL" - ) + logger.warning("Couldn't find the fulltext URL") raise FulltextMissingError("DiVA", abs_url) - + return pdf_url.get("content") def _get_abs_url(self, pdf_url): diva_id = re.findall("diva2:[0-9]+", pdf_url)[0].split(":")[1] - url_candiate = re.findall("https?://[a-z]+.diva-portal.org/smash/", pdf_url)[0] + url_candiate = re.findall( + "https?://[a-z]+.diva-portal.org/smash/", pdf_url + )[0] url_candiate += "record.jsf?pid=diva2%3A" + diva_id return url_candiate @@ -72,5 +76,6 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("DiVA", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(DiVA.re_abs, src) or re.match(DiVA.re_pdf, src) diff --git a/paper2remarkable/providers/eccc.py b/paper2remarkable/providers/eccc.py index c5435ce..a2e3468 100644 --- a/paper2remarkable/providers/eccc.py +++ b/paper2remarkable/providers/eccc.py @@ -31,7 +31,8 @@ def get_title(self, soup): h4 = divsoup.find("h4") if not h4: logger.warning( - "Couldn't determine title information, maybe provide the desired filename using '--filename'?" + "Couldn't determine title information, maybe provide the " + "desired filename using '--filename'?" ) return "" return h4.get_text().strip() @@ -43,7 +44,8 @@ def get_authors(self, soup): ) if not aa: logger.warning( - "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + "Couldn't determine author information, maybe provide the " + "desired filename using '--filename'?" ) return "" authors = [a.get_text() for a in aa] @@ -56,7 +58,8 @@ def get_year(self, soup): ) if line is None: logger.warning( - "Couldn't determine year information, maybe provide the desired filename using '--filename'?" + "Couldn't determine year information, maybe provide the " + "desired filename using '--filename'?" ) return "" year = line.strip().split(" ")[3] # bit lazy @@ -64,8 +67,8 @@ def get_year(self, soup): class ECCC(Provider): - re_abs = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$" - re_pdf = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$" + re_abs = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$" + re_pdf = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -82,5 +85,6 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("ECCC", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(ECCC.re_abs, src) or re.match(ECCC.re_pdf, src) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index c4dd21a..7520869 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -145,8 +145,8 @@ def fix_lazy_loading(self, article): # placeholder and the data-src attribute contains the url to the actual # image. Note that results may differ between readability and # Readability.JS - regex = '.*) data-src="(?P.*?)" (?P.*?)>' - sub = ' \g>' + regex = r'.*) data-src="(?P.*?)" (?P.*?)>' + sub = r' \g>' article, nsub = re.subn(regex, sub, article, flags=re.MULTILINE) if nsub: @@ -203,6 +203,7 @@ def retrieve_pdf(self, pdf_url, filename): style = weasyprint.CSS(string=css) html.write_pdf(filename, stylesheets=[style] + font_urls) + @staticmethod def validate(src): # first check if it is a valid url parsed = urllib.parse.urlparse(src) diff --git a/paper2remarkable/providers/iacr.py b/paper2remarkable/providers/iacr.py index 07718d8..ec7ec17 100644 --- a/paper2remarkable/providers/iacr.py +++ b/paper2remarkable/providers/iacr.py @@ -28,7 +28,8 @@ def get_title(self, soup): title = soup.find_all("title") if not title: logger.warning( - "Couldn't determine title information, maybe provide the desired filename using '--filename'?" + "Couldn't determine title information, maybe provide the " + "desired filename using '--filename'?" ) return "" return title[0].get_text() @@ -37,7 +38,8 @@ def get_authors(self, soup): p = soup.find_all("p", {"class": "fst-italic"}) if not p: logger.warning( - "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + "Couldn't determine author information, maybe provide the " + "desired filename using '--filename'?" ) return "" text = p[0].text @@ -51,7 +53,8 @@ def get_year(self, soup): h4 = soup.find("main").find_all("h4") if not h4: logger.warning( - "Couldn't determine year information, maybe provide the desired filename using '--filename'?" + "Couldn't determine year information, maybe provide the " + "desired filename using '--filename'?" ) return "" text = h4[0].get_text() @@ -62,9 +65,9 @@ def get_year(self, soup): class IACR(Provider): - re_abs = "https?://eprint.iacr.org/\d{4}/\d+$" - re_pdf = "https?://eprint.iacr.org/\d{4}/\d+\.pdf$" - re_ps = "https?://eprint.iacr.org/\d{4}/\d+\.ps$" + re_abs = r"https?://eprint.iacr.org/\d{4}/\d+$" + re_pdf = r"https?://eprint.iacr.org/\d{4}/\d+\.pdf$" + re_ps = r"https?://eprint.iacr.org/\d{4}/\d+\.ps$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -83,12 +86,28 @@ def _get_doc_url(self, abs_url): return abs_url + ".pdf" dd = dt.find_next_sibling("dd") aa = dd.find_all("a") - a = next((a for a in aa if "PDF" in a.get_text()), None) - if not a is None: - return urllib.parse.urljoin(abs_url, a.get("href")) - a = next((a for a in aa if "PS" in a.get_text()), None) - if not a is None: - return urllib.parse.urljoin(abs_url, a.get("href")) + pdf_tag = next( + ( + a + for a in aa + if "PDF" in a.get_text() + and a.get("href").lower().endswith(".pdf") + ), + None, + ) + if pdf_tag is not None: + return urllib.parse.urljoin(abs_url, pdf_tag.get("href")) + ps_tag = next( + ( + a + for a in aa + if "PS" in a.get_text() + and a.get("href").lower().endswith(".ps") + ), + None, + ) + if ps_tag is not None: + return urllib.parse.urljoin(abs_url, ps_tag.get("href")) # Fallback return abs_url + ".pdf" @@ -112,5 +131,6 @@ def retrieve_pdf(self, pdf_url, filename): super().retrieve_pdf(pdf_url, tmpfilename) self.rewrite_pdf(tmpfilename, out_pdf=filename) + @staticmethod def validate(src): return re.match(IACR.re_abs, src) or re.match(IACR.re_pdf, src) diff --git a/paper2remarkable/providers/jmlr.py b/paper2remarkable/providers/jmlr.py index efc3e28..8af6d12 100644 --- a/paper2remarkable/providers/jmlr.py +++ b/paper2remarkable/providers/jmlr.py @@ -28,11 +28,11 @@ def _format_authors(self, soup_authors): class JMLR(Provider): - re_abs_1 = "https?://(www\.)?jmlr\.org/papers/v(?P\d+)/(?P\d{2}\-\d{3}).html$" - re_pdf_1 = "https?://(www\.)?jmlr\.org/papers/volume(?P\d+)/(?P\d{2}\-\d{3})/(?P=pid).pdf$" + re_abs_1 = r"https?://(www\.)?jmlr\.org/papers/v(?P\d+)/(?P\d{2}\-\d{3}).html$" + re_pdf_1 = r"https?://(www\.)?jmlr\.org/papers/volume(?P\d+)/(?P\d{2}\-\d{3})/(?P=pid).pdf$" - re_abs_2 = "https?://(www\.)?jmlr\.org/papers/v(?P\d+)/(?P\w+\d{2}\w).html$" - re_pdf_2 = "https?://(www\.)?jmlr\.org/papers/volume(?P\d+)/(?P\w+\d{2}\w)/(?P=pid).pdf$" + re_abs_2 = r"https?://(www\.)?jmlr\.org/papers/v(?P\d+)/(?P\w+\d{2}\w).html$" + re_pdf_2 = r"https?://(www\.)?jmlr\.org/papers/volume(?P\d+)/(?P\w+\d{2}\w)/(?P=pid).pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -64,6 +64,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("JMLR", url) return abs_url, pdf_url + @staticmethod def validate(src): return ( re.match(JMLR.re_abs_1, src) diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py index 3f581b2..aaa11f9 100644 --- a/paper2remarkable/providers/local.py +++ b/paper2remarkable/providers/local.py @@ -30,6 +30,7 @@ def get_abs_pdf_urls(self, url): # pdf_url. return url, url + @staticmethod def validate(src): return os.path.exists(src) diff --git a/paper2remarkable/providers/nature.py b/paper2remarkable/providers/nature.py index 48ebe7c..24a2c06 100644 --- a/paper2remarkable/providers/nature.py +++ b/paper2remarkable/providers/nature.py @@ -23,8 +23,8 @@ def _format_authors(self, soup_authors): class Nature(Provider): - re_abs = "^https://www.nature.com/articles/s[a-z0-9\-]+$" - re_pdf = "^https://www.nature.com/articles/s[a-z0-9\-]+\.pdf$" + re_abs = r"^https://www.nature.com/articles/s[a-z0-9\-]+$" + re_pdf = r"^https://www.nature.com/articles/s[a-z0-9\-]+\.pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -41,5 +41,6 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("Nature", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(Nature.re_abs, src) or re.match(Nature.re_pdf, src) diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py index 7909cec..1df9c64 100644 --- a/paper2remarkable/providers/nber.py +++ b/paper2remarkable/providers/nber.py @@ -25,10 +25,9 @@ def _format_authors(self, soup_authors, sep=" ", idx=0, op=None): class NBER(Provider): - re_abs = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)$" - re_pdf = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)\.pdf$" - - re_pdf_2 = "https://www.nber.org/system/files/working_papers/(?P[a-z0-9]+)/(?P=ref).pdf" + re_abs = r"https?://www\.nber\.org/papers/(?P[a-z0-9]+)$" + re_pdf = r"https?://www\.nber\.org/papers/(?P[a-z0-9]+)\.pdf$" + re_pdf_2 = r"https://www.nber.org/system/files/working_papers/(?P[a-z0-9]+)/(?P=ref).pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -57,6 +56,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("NBER", url) return abs_url, pdf_url + @staticmethod def validate(src): return ( re.match(NBER.re_abs, src) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index b9061bc..3b68a43 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -20,7 +20,7 @@ class OpenReviewInformer(Informer): - meta_date_key = "citation_publication_date" + meta_date_key = "citation_online_date" def get_authors(self, soup): # Get the authors for OpenReview by parsing the JSON payload @@ -55,8 +55,8 @@ def _format_authors(self, soup_authors): class OpenReview(Provider): - re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" - re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" + re_abs = r"https?://openreview.net/forum\?id=[A-Za-z0-9]+" + re_pdf = r"https?://openreview.net/pdf\?id=[A-Za-z0-9]+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -74,6 +74,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("OpenReview", url) return abs_url, pdf_url + @staticmethod def validate(src): """Check if the url is a valid OpenReview url.""" return re.match(OpenReview.re_abs, src) or re.match( diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 520bf2b..4a58f10 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -56,6 +56,7 @@ def __init__(self, *args, **kwargs): def get_abs_pdf_urls(self, url): return (url, url) + @staticmethod def validate(src): # first check if it is a valid url parsed = urllib.parse.urlparse(src) diff --git a/paper2remarkable/providers/pmlr.py b/paper2remarkable/providers/pmlr.py index 391f4ab..a208577 100644 --- a/paper2remarkable/providers/pmlr.py +++ b/paper2remarkable/providers/pmlr.py @@ -23,11 +23,11 @@ def _format_authors(self, soup_authors): class PMLR(Provider): - re_abs_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.html" - re_pdf_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.pdf" + re_abs_1 = r"https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.html" + re_pdf_1 = r"https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.pdf" - re_abs_2 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+\w?.html" - re_pdf_2 = "https?://proceedings.mlr.press/v\d+/(?P[\w\-\w]+\d+\w?)/(?P=ref).pdf" + re_abs_2 = r"https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+\w?.html" + re_pdf_2 = r"https?://proceedings.mlr.press/v\d+/(?P[\w\-\w]+\d+\w?)/(?P=ref).pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -57,6 +57,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("PMLR", url) return abs_url, pdf_url + @staticmethod def validate(src): return ( re.fullmatch(PMLR.re_abs_1, src) diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py index 0fe7fd4..dd5ff6d 100644 --- a/paper2remarkable/providers/pubmed.py +++ b/paper2remarkable/providers/pubmed.py @@ -24,9 +24,9 @@ def _format_authors(self, soup_authors): class PubMed(Provider): - re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" + re_abs = r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" re_pdf = ( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" + r"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" ) def __init__(self, *args, **kwargs): @@ -46,5 +46,6 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("PMC", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src) diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py index 659b3a2..6c1dc55 100644 --- a/paper2remarkable/providers/sagepub.py +++ b/paper2remarkable/providers/sagepub.py @@ -28,8 +28,8 @@ def _format_year(self, soup_date): class SagePub(Provider): - re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+" - re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+" + re_abs = r"https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+" + re_pdf = r"https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -46,5 +46,6 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("SagePub", url) return abs_url, pdf_url + @staticmethod def validate(src): return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src) diff --git a/paper2remarkable/providers/science_direct.py b/paper2remarkable/providers/science_direct.py index 384489a..27df60b 100644 --- a/paper2remarkable/providers/science_direct.py +++ b/paper2remarkable/providers/science_direct.py @@ -35,7 +35,8 @@ def get_authors(self, soup): surname_tags = soup.find_all("span", attrs={"class": "text surname"}) if not surname_tags: logger.warning( - "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + "Couldn't determine author information, maybe provide the " + "desired filename using '--filename'?" ) return "" authors = [x.text for x in surname_tags] @@ -44,9 +45,9 @@ def get_authors(self, soup): class ScienceDirect(Provider): re_abs = ( - "https?:\/\/www.sciencedirect.com/science/article/pii/[A-Za-z0-9]+" + r"https?:\/\/www.sciencedirect.com/science/article/pii/[A-Za-z0-9]+" ) - re_pdf = "https://pdf.sciencedirectassets.com/\d+/([0-9a-zA-Z\-\.]+)/(?P[0-9a-zA-Z\-\.]+)/main.pdf\?.*" + re_pdf = r"https://pdf.sciencedirectassets.com/\d+/([0-9a-zA-Z\-\.]+)/(?P[0-9a-zA-Z\-\.]+)/main.pdf\?.*" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -82,16 +83,16 @@ def _get_pdf_url(self, url): raise URLResolutionError("ScienceDirect", url) json_data = scripts[0].string data = json.loads(json_data) - if not "article" in data: + if "article" not in data: raise URLResolutionError("ScienceDirect", url) data = data["article"] - if not "pdfDownload" in data: + if "pdfDownload" not in data: raise URLResolutionError("ScienceDirect", url) data = data["pdfDownload"] - if not "urlMetadata" in data: + if "urlMetadata" not in data: raise URLResolutionError("ScienceDirect", url) meta = data["urlMetadata"] @@ -164,6 +165,7 @@ def _get_pdf_url(self, url): pdf_url = a[0].get("href") return pdf_url + @staticmethod def validate(src): return re.match(ScienceDirect.re_abs, src) or re.match( ScienceDirect.re_pdf, src diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py index 3dd7b41..fff146c 100644 --- a/paper2remarkable/providers/semantic_scholar.py +++ b/paper2remarkable/providers/semantic_scholar.py @@ -27,9 +27,7 @@ def _format_authors(self, soup_authors): class SemanticScholar(Provider): - re_abs = ( - "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" - ) + re_abs = r"https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -60,12 +58,18 @@ def _get_pdf_url(self, url): return a["href"] # Next try to get the url from the metadata (not always a pdf) - meta = soup.find_all("meta", {"name": "citation_pdf_url"}) - if not meta: + meta_1 = soup.find_all("meta", {"property": "citation_pdf_url"}) + meta_2 = soup.find_all("meta", {"name": "citation_pdf_url"}) + + if not (meta_1 or meta_2): raise URLResolutionError( "SemanticScholar", url, reason="Page has no url to PDF file" ) - pdf_url = meta[0]["content"] + + if meta_1: + pdf_url = meta_1[0]["content"] + else: + pdf_url = meta_2[0]["content"] # Check the content type to check that the data will be a pdf content_type = get_content_type_with_retry(pdf_url) @@ -83,5 +87,6 @@ def _get_pdf_url(self, url): ) return pdf_url + @staticmethod def validate(src): return re.match(SemanticScholar.re_abs, src) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index 088be41..1494874 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -35,9 +35,9 @@ def get_year(self, soup): class Springer(Provider): - re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf" + re_abs_1 = r"https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_abs_2 = r"https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = r"https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -75,6 +75,7 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("Springer", url) return abs_url, pdf_url + @staticmethod def validate(src): return ( re.match(Springer.re_abs_1, src) diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py index 108de32..5574cbb 100644 --- a/paper2remarkable/providers/tandfonline.py +++ b/paper2remarkable/providers/tandfonline.py @@ -31,8 +31,8 @@ def _format_year(self, soup_date): class TandFOnline(Provider): - re_abs = "^https?://\w+.tandfonline.com/doi/(full|abs)/(?P\d+\.\d+/\w+\.\w+\.\w+)" - re_pdf = "^https?://\w+.tandfonline.com/doi/(full|pdf)/(?P\d+\.\d+/\w+\.\w+\.\w+)" + re_abs = r"^https?://\w+.tandfonline.com/doi/(full|abs)/(?P\d+\.\d+/\w+\.\w+\.\w+)" + re_pdf = r"^https?://\w+.tandfonline.com/doi/(full|pdf)/(?P\d+\.\d+/\w+\.\w+\.\w+)" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -66,8 +66,9 @@ def get_abs_pdf_urls(self, url): raise URLResolutionError("TandFOnline", url) return abs_url, pdf_url + @staticmethod def validate(src): m = re.match(TandFOnline.re_abs, src) or re.match( TandFOnline.re_pdf, src ) - return not m is None + return m is not None diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index efed7f9..065ac9f 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -225,13 +225,13 @@ def merge_options(args, config=None): def set_bool(d, key, value, invert=False): if value: d[key] = True ^ invert - elif not key in d: + elif key not in d: d[key] = False ^ invert def set_path(d, key, value): - if not value is None: + if value is not None: d[key] = value - elif not key in d: + elif key not in d: d[key] = key set_bool(opts["core"], "blank", args.blank) @@ -245,12 +245,12 @@ def set_path(d, key, value): opts["core"]["crop"] = "right" elif args.no_crop: opts["core"]["crop"] = "none" - elif not "crop" in opts["core"]: + elif "crop" not in opts["core"]: opts["core"]["crop"] = "left" if args.remarkable_dir is not None: opts["core"]["remarkable_dir"] = args.remarkable_dir - elif not "remarkable_dir" in opts["core"]: + elif "remarkable_dir" not in opts["core"]: opts["core"]["remarkable_dir"] = "/" set_path(opts["system"], "gs", args.gs) @@ -263,14 +263,14 @@ def set_path(d, key, value): with open(args.css, "r") as fp: contents = fp.read() opts["html"]["css"] = contents - elif not "css" in opts["html"]: + elif "css" not in opts["html"]: opts["html"]["css"] = None if args.font_urls and os.path.exists(args.font_urls): with open(args.font_urls, "r") as fp: - urls = [l.strip() for l in fp.readlines()] + urls = [line.strip() for line in fp.readlines()] opts["html"]["font_urls"] = urls - elif not "font_urls" in opts["html"]: + elif "font_urls" not in opts["html"]: opts["html"]["font_urls"] = None return opts diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 01fca14..5f4905a 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -32,6 +32,7 @@ "Safari/537.36" } +HTTP_SERVICE_UNAVAILABLE = 503 logger = Logger() @@ -73,6 +74,7 @@ def download_url(url, filename, cookiejar=None): def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): count = 0 + res = None jar = {} if cookiejar is None else cookiejar while count < tries: count += 1 @@ -81,11 +83,14 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): res = requests.get(url, headers=HEADERS, cookies=jar) except requests.exceptions.ConnectionError: error = True + if ( - res.status_code == 503 + res is not None + and res.status_code == HTTP_SERVICE_UNAVAILABLE and res.headers.get("server", "") == "cloudflare" ): raise BlockedByCloudFlareError(url) + if error or not res.ok: logger.warning( "(%i/%i) Error getting url %s. Retrying in 5 seconds." @@ -93,6 +98,7 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): ) time.sleep(5) continue + logger.info("Downloaded url: %s" % url) if return_text: return res.text @@ -184,7 +190,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): def is_url(string): # pattern adapted from CleverCSV - pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*[\-\_]?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" + pattern = r"((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*[\-\_]?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None diff --git a/setup.py b/setup.py index bbce7f5..81eb373 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ LICENSE = "MIT" LICENSE_TROVE = "License :: OSI Approved :: MIT License" NAME = "paper2remarkable" -REQUIRES_PYTHON = ">=3.6.0" +REQUIRES_PYTHON = ">=3.9.0" URL = "https://github.com/GjjvdBurg/paper2remarkable" VERSION = None @@ -100,7 +100,10 @@ LICENSE_TROVE, "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Intended Audience :: Education", diff --git a/tests/test_html.py b/tests/test_html.py index 05b8a54..cb32b51 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -48,7 +48,7 @@ def test_custom_css(self): prov = HTML(upload=False, css=test_css, font_urls=test_font_urls) filename = prov.run(url) with pdfplumber.open(filename) as pdf: - self.assertEqual(9, len(pdf.pages)) + self.assertEqual(7, len(pdf.pages)) os.unlink(filename) diff --git a/tests/test_providers.py b/tests/test_providers.py index 3c9cb4f..47e4b9d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -__author__ = "G.J.J. van den Burg" - """Tests""" import hashlib @@ -16,11 +14,11 @@ from _constants import TEST_FILE from pikepdf import Pdf -from paper2remarkable.exceptions import URLResolutionError, FulltextMissingError +from paper2remarkable.exceptions import FulltextMissingError +from paper2remarkable.exceptions import URLResolutionError from paper2remarkable.providers import ACL from paper2remarkable.providers import ACM from paper2remarkable.providers import CVF -from paper2remarkable.providers import DiVA from paper2remarkable.providers import ECCC from paper2remarkable.providers import HTML from paper2remarkable.providers import IACR @@ -29,6 +27,7 @@ from paper2remarkable.providers import PMLR from paper2remarkable.providers import Arxiv from paper2remarkable.providers import CiteSeerX +from paper2remarkable.providers import DiVA from paper2remarkable.providers import LocalFile from paper2remarkable.providers import Nature from paper2remarkable.providers import NeurIPS @@ -146,13 +145,20 @@ def test_acm_3(self): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_openreview(self): + def test_openreview_1(self): prov = OpenReview(upload=False, verbose=VERBOSE) url = "https://openreview.net/forum?id=S1x4ghC9tQ" exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_openreview_2(self): + prov = OpenReview(upload=False, verbose=VERBOSE) + url = "https://openreview.net/pdf?id=PlGSgjFK2oJ" + exp_filename = "Burg_Williams_-_On_Memorization_in_Probabilistic_Deep_Generative_Models_2021.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" @@ -564,16 +570,19 @@ def test_diva_1(self): def test_diva_2(self): # Testing absolute URLs and sanitization of filenames prov = DiVA(upload=False, verbose=VERBOSE) - url = "https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1480467" + url = ( + "https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1480467" + ) exp = "Alhussein_-_Privacy_by_Design_Amp_Internet_of_Things_Managing_Privacy_2018.pdf" filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) - + def test_diva_3(self): # Testing older entries without available fulltext prov = DiVA(upload=False, verbose=VERBOSE) url = "https://uu.diva-portal.org/smash/record.jsf?pid=diva2%3A59234" self.assertRaises(FulltextMissingError, prov.run, url) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ui.py b/tests/test_ui.py index 2d97e3d..2451f9d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -72,7 +72,7 @@ def test_choose_provider_1(self): ( Arxiv, "https://arxiv.org/pdf/physics/0605197v1.pdf", - "https://arxiv.org/pdf/physics/0605197v1.pdf", + "http://arxiv.org/pdf/physics/0605197v1", ), ( PubMed, @@ -374,8 +374,8 @@ def test_merge_options_5(self): with self.subTest(s): test_sys(s) - self.assertEquals(opts["html"]["css"], "Hello, World!\n") - self.assertEquals(opts["html"]["font_urls"], ["url_1", "url_2"]) + self.assertEqual(opts["html"]["css"], "Hello, World!\n") + self.assertEqual(opts["html"]["font_urls"], ["url_1", "url_2"]) def test_runner_1(self): inputs = [