Fixes for various providers (#146)

* Fix broken tests
* Code style and formatting
* Fixes for various providers
* Show debug info for resizing
* more code formatting
* Bump black version and reformat
* Bump minimum Python version to 3.9
GjjvdBurg · Aug 12, 2024 · dd1a5b8 · dd1a5b8
1 parent c645528
commit dd1a5b8
Show file tree

Hide file tree

Showing 34 changed files with 218 additions and 127 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: [ 'ubuntu-latest' ]
     strategy:
       matrix:
-        py: [ '3.8', '3.11' ] # minimum required and latest stable
+        py: [ '3.9', '3.11' ] # minimum required and latest stable
 
     steps:
       - name: Install Python ${{ matrix.py }}
@@ -32,7 +32,7 @@ jobs:
       - name: Run code quality tests (black)
         uses: psf/black@stable
         with:
-          version: "23.3.0"
+          version: "24.8.0"
 
       - name: Run code quality tests (isort)
         uses: jamescurtin/isort-action@master

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 # NOTE: Keep versions in sync with Github Actions test.yml
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 24.8.0
     hooks:
       - id: black
         language_version: python3

diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
@@ -35,8 +35,9 @@ def __init__(self, provider, url, reason=None):
         self.reason = reason
 
     def __str__(self):
-        msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format(
-            provider=self.provider, url=self.url
+        msg = (
+            f"ERROR: Couldn't figure out {self.provider} URLs from provided "
+            f"url: {self.url}"
         )
         if self.reason:
             msg += "\nReason: {reason}".format(reason=self.reason)
@@ -53,8 +54,9 @@ def __init__(self, provider, url, reason=None):
         self.reason = reason
 
     def __str__(self):
-        msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format(
-            provider=self.provider, url=self.url
+        msg = (
+            f"ERROR: Couldn't determine a filename from {self.url} for "
+            f"provider {self.provider}"
         )
         if self.reason:
             msg += "\nReason: {reason}".format(reason=self.reason)
@@ -153,6 +155,7 @@ def __str__(self):
         )
         return msg
 
+
 class FulltextMissingError(Error):
     """Exception raised when the fulltext PDF can't be found."""
 
@@ -166,4 +169,4 @@ def __str__(self):
             f"\t{self.provider}\n"
             f"\t{self.url}\n"
         )
-        return msg 
+        return msg
diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py
@@ -41,7 +41,7 @@ def disable(self):
     def _log(self, msg, mode, end="\n", add_prefix=True):
         if not self.enabled:
             return
-        if not mode in ("info", "warn"):
+        if mode not in ("info", "warn"):
             raise ValueError("Unknown logging mode: %s" % mode)
         file = sys.stdout if mode == "info" else sys.stderr
         if add_prefix:

diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
@@ -56,6 +56,13 @@ def blank_pdf(filepath):
     return output_file
 
 
+def _filesize_string(size: int) -> str:
+    """Format a byte count with two decimals, using units from B up to TB."""
+    units = ["B", "KB", "MB", "GB", "TB"]
+    idx = next((i for i in range(4) if size < 1024 ** (i + 1)), 4)  # cap at TB
+    return f"{size / 1024**idx:.2f} {units[idx]}"
+
+
 def shrink_pdf(filepath, gs_path="gs"):
     """Shrink the PDF file size using Ghostscript"""
     logger.info("Shrinking pdf file ...")
@@ -79,8 +86,14 @@ def shrink_pdf(filepath, gs_path="gs"):
     if not status == 0:
         logger.warning("Failed to shrink the pdf file")
         return filepath
+
     size_after = os.path.getsize(output_file)
     if size_after > size_before:
-        logger.info("Shrinking has no effect for this file, using original.")
+        size_str = _filesize_string(size_before)
+        logger.info(
+            f"Shrinking has no effect for this file, using original ({size_str})."
+        )
         return filepath
+    size_str = _filesize_string(size_after)
+    logger.info(f"Shrinking brought filesize down to {size_str}")
     return output_file
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
@@ -156,6 +156,8 @@ def rewrite_pdf(self, in_file, out_pdf=None):
                 self.gs_path,
                 "-sDEVICE=pdfwrite",
                 "-dQUIET",
+                "-dWriteXRefStm=false",
+                "-dWriteObjStms=false",
                 "-o",
                 out_pdf,
                 in_file,

diff --git a/paper2remarkable/providers/acl.py b/paper2remarkable/providers/acl.py
@@ -23,10 +23,10 @@ def _format_authors(self, soup_authors):
 
 
 class ACL(Provider):
-    re_abs_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
-    re_abs_2 = "^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]+)"
-    re_pdf_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
-    re_pdf_2 = "^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
+    re_abs_1 = r"^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
+    re_abs_2 = r"^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]+)"
+    re_pdf_1 = r"^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
+    re_pdf_2 = r"^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -59,6 +59,7 @@ def get_abs_pdf_urls(self, url):
 
         raise URLResolutionError("ACL", url)
 
+    @staticmethod
     def validate(src):
         return (
             re.match(ACL.re_pdf_1, src)

diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
@@ -22,29 +22,30 @@ class ACMInformer(Informer):
     meta_author_key = "citation_authors"
 
     def get_title(self, soup):
-        target = soup.find("h1", {"class": "citation__title"})
+        target = soup.find("div", {"class": "core-publication-title"})
         return target.text
 
     def get_authors(self, soup):
         authors = [
-            a["title"] for a in soup.find_all("a", {"class": "author-name"})
+            author_block.find("span", {"property": "familyName"}).text
+            for author_block in soup.find_all("span", {"property": "author"})
         ]
-        return self._format_authors(authors)
+        return authors
 
     def _format_authors(self, soup_authors):
         return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
     def get_year(self, soup):
-        date = soup.find("span", {"class": "epub-section__date"})
+        date = soup.find("span", {"class": "core-date-published"})
         return self._format_year(date.text)
 
     def _format_year(self, soup_date):
         return soup_date.strip().split(" ")[-1].strip()
 
 
 class ACM(Provider):
-    re_abs = "^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
-    re_pdf = "^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)(\?download=true)?"
+    re_abs = r"^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
+    re_pdf = r"^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)(\?download=true)?"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -71,6 +72,7 @@ def get_abs_pdf_urls(self, url):
             raise URLResolutionError("ACM", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         m = re.match(ACM.re_abs, src) or re.match(ACM.re_pdf, src)
-        return not m is None
+        return m is not None
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
@@ -29,11 +29,11 @@ class ArxivInformer(Informer):
 
 
 class Arxiv(Provider):
-    re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
-    re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
+    re_abs_1 = r"https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
+    re_pdf_1 = r"https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?"
 
-    re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
-    re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf"
+    re_abs_2 = r"https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
+    re_pdf_2 = r"https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?(\.pdf)?"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -50,12 +50,15 @@ def get_abs_pdf_urls(self, url):
             abs_url = url
             pdf_url = url.replace("abs", "pdf") + ".pdf"
         elif re.match(self.re_pdf_1, url) or re.match(self.re_pdf_2, url):
-            abs_url = url[:-4].replace("pdf", "abs")
+            if url.endswith(".pdf"):
+                url = url[:-4]
+            abs_url = url.replace("pdf", "abs")
             pdf_url = url
         else:
             raise URLResolutionError("arXiv", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         """Check if the url is to an arXiv page."""
         return (

diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
@@ -9,7 +9,6 @@
 """
 
 import re
-import time
 
 from ..exceptions import URLResolutionError
 from ..log import Logger
@@ -24,13 +23,14 @@ class CiteSeerXInformer(Informer):
     meta_date_key = "citation_year"
 
     def _format_authors(self, soup_authors):
-        op = lambda x: x[0].split(",")
-        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+        return super()._format_authors(
+            soup_authors, sep=" ", idx=-1, op=lambda x: x[0].split(",")
+        )
 
 
 class CiteSeerX(Provider):
-    re_abs = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
-    re_pdf = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"
+    re_abs = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
+    re_pdf = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -72,6 +72,7 @@ def get_abs_pdf_urls(self, url):
             raise URLResolutionError("CiteSeerX", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         return re.match(CiteSeerX.re_abs, src) or re.match(
             CiteSeerX.re_pdf, src

diff --git a/paper2remarkable/providers/cvf.py b/paper2remarkable/providers/cvf.py
@@ -23,8 +23,8 @@ class CVFInformer(Informer):
 
 
 class CVF(Provider):
-    re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
-    re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"
+    re_abs = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
+    re_pdf = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -43,6 +43,7 @@ def get_abs_pdf_urls(self, url):
             raise URLResolutionError("CVF", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src)
-        return not m is None
+        return m is not None
diff --git a/paper2remarkable/providers/diva.py b/paper2remarkable/providers/diva.py
@@ -8,13 +8,12 @@
 
 """
 
-import os
 import re
-import urllib.parse
 
 import bs4
 
-from ..exceptions import URLResolutionError, FulltextMissingError
+from ..exceptions import FulltextMissingError
+from ..exceptions import URLResolutionError
 from ..log import Logger
 from ..utils import get_page_with_retry
 from ._base import Provider
@@ -25,18 +24,23 @@
 
 class DiVAInformer(Informer):
     def get_year(self, soup):
-        year = soup.find("meta", {"name": "citation_publication_date"}).get("content")
+        year = soup.find("meta", {"name": "citation_publication_date"}).get(
+            "content"
+        )
         if not year:
             logger.warning(
-                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
+                "Couldn't determine year information, maybe provide the "
+                "desired filename using '--filename'?"
             )
             return ""
         return year
 
 
 class DiVA(Provider):
-    re_abs = "^https?://[a-z]+.diva-portal.org/smash/record.jsf"
-    re_pdf = "^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT"
+    re_abs = r"^https?://[a-z]+.diva-portal.org/smash/record.jsf"
+    re_pdf = (
+        r"^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT"
+    )
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -48,16 +52,16 @@ def _get_doc_url(self, abs_url):
 
         pdf_url = soup.find("meta", {"name": "citation_pdf_url"})
         if pdf_url is None:
-            logger.warning(
-                "Couldn't find the fulltext URL"
-            )
+            logger.warning("Couldn't find the fulltext URL")
             raise FulltextMissingError("DiVA", abs_url)
-        
+
         return pdf_url.get("content")
 
     def _get_abs_url(self, pdf_url):
         diva_id = re.findall("diva2:[0-9]+", pdf_url)[0].split(":")[1]
-        url_candiate = re.findall("https?://[a-z]+.diva-portal.org/smash/", pdf_url)[0]
+        url_candiate = re.findall(
+            "https?://[a-z]+.diva-portal.org/smash/", pdf_url
+        )[0]
         url_candiate += "record.jsf?pid=diva2%3A" + diva_id
         return url_candiate
 
@@ -72,5 +76,6 @@ def get_abs_pdf_urls(self, url):
             raise URLResolutionError("DiVA", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         return re.match(DiVA.re_abs, src) or re.match(DiVA.re_pdf, src)
diff --git a/paper2remarkable/providers/eccc.py b/paper2remarkable/providers/eccc.py
@@ -31,7 +31,8 @@ def get_title(self, soup):
         h4 = divsoup.find("h4")
         if not h4:
             logger.warning(
-                "Couldn't determine title information, maybe provide the desired filename using '--filename'?"
+                "Couldn't determine title information, maybe provide the "
+                "desired filename using '--filename'?"
             )
             return ""
         return h4.get_text().strip()
@@ -43,7 +44,8 @@ def get_authors(self, soup):
         )
         if not aa:
             logger.warning(
-                "Couldn't determine author information, maybe provide the desired filename using '--filename'?"
+                "Couldn't determine author information, maybe provide the "
+                "desired filename using '--filename'?"
             )
             return ""
         authors = [a.get_text() for a in aa]
@@ -56,16 +58,17 @@ def get_year(self, soup):
         )
         if line is None:
             logger.warning(
-                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
+                "Couldn't determine year information, maybe provide the "
+                "desired filename using '--filename'?"
             )
             return ""
         year = line.strip().split(" ")[3]  # bit lazy
         return year
 
 
 class ECCC(Provider):
-    re_abs = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$"
-    re_pdf = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$"
+    re_abs = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$"
+    re_pdf = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -82,5 +85,6 @@ def get_abs_pdf_urls(self, url):
             raise URLResolutionError("ECCC", url)
         return abs_url, pdf_url
 
+    @staticmethod
     def validate(src):
         return re.match(ECCC.re_abs, src) or re.match(ECCC.re_pdf, src)