Skip to content

Commit

Permalink
Fixes for various providers (#146)
Browse files Browse the repository at this point in the history
* Fix broken tests

* Code style and formatting

* Fixes for various providers

* Show debug info for resizing

* more code formatting

* Bump black version and reformat

* Bump minimum Python version to 3.9
  • Loading branch information
GjjvdBurg authored Aug 12, 2024
1 parent c645528 commit dd1a5b8
Show file tree
Hide file tree
Showing 34 changed files with 218 additions and 127 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: [ 'ubuntu-latest' ]
strategy:
matrix:
py: [ '3.8', '3.11' ] # minimum required and latest stable
py: [ '3.9', '3.11' ] # minimum required and latest stable

steps:
- name: Install Python ${{ matrix.py }}
Expand All @@ -32,7 +32,7 @@ jobs:
- name: Run code quality tests (black)
uses: psf/black@stable
with:
version: "23.3.0"
version: "24.8.0"

- name: Run code quality tests (isort)
uses: jamescurtin/isort-action@master
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# NOTE: Keep versions in sync with GitHub Actions test.yml
repos:
- repo: https://github.com/psf/black
rev: 23.3.0
rev: 24.8.0
hooks:
- id: black
language_version: python3
Expand Down
13 changes: 8 additions & 5 deletions paper2remarkable/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ def __init__(self, provider, url, reason=None):
self.reason = reason

def __str__(self):
msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format(
provider=self.provider, url=self.url
msg = (
f"ERROR: Couldn't figure out {self.provider} URLs from provided "
f"url: {self.url}"
)
if self.reason:
msg += "\nReason: {reason}".format(reason=self.reason)
Expand All @@ -53,8 +54,9 @@ def __init__(self, provider, url, reason=None):
self.reason = reason

def __str__(self):
msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format(
provider=self.provider, url=self.url
msg = (
f"ERROR: Couldn't determine a filename from {self.url} for "
f"provider {self.provider}"
)
if self.reason:
msg += "\nReason: {reason}".format(reason=self.reason)
Expand Down Expand Up @@ -153,6 +155,7 @@ def __str__(self):
)
return msg


class FulltextMissingError(Error):
"""Exception raised when the fulltext PDF can't be found."""

Expand All @@ -166,4 +169,4 @@ def __str__(self):
f"\t{self.provider}\n"
f"\t{self.url}\n"
)
return msg
return msg
2 changes: 1 addition & 1 deletion paper2remarkable/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def disable(self):
def _log(self, msg, mode, end="\n", add_prefix=True):
if not self.enabled:
return
if not mode in ("info", "warn"):
if mode not in ("info", "warn"):
raise ValueError("Unknown logging mode: %s" % mode)
file = sys.stdout if mode == "info" else sys.stderr
if add_prefix:
Expand Down
15 changes: 14 additions & 1 deletion paper2remarkable/pdf_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ def blank_pdf(filepath):
return output_file


def _filesize_string(size: int) -> str:
for unit in ["B", "KB", "MB", "GB", "TB"]:
if size < 1024:
return f"{size:.2f} {unit}"
size /= 1024


def shrink_pdf(filepath, gs_path="gs"):
"""Shrink the PDF file size using Ghostscript"""
logger.info("Shrinking pdf file ...")
Expand All @@ -79,8 +86,14 @@ def shrink_pdf(filepath, gs_path="gs"):
if not status == 0:
logger.warning("Failed to shrink the pdf file")
return filepath

size_after = os.path.getsize(output_file)
if size_after > size_before:
logger.info("Shrinking has no effect for this file, using original.")
size_str = _filesize_string(size_before)
logger.info(
f"Shrinking has no effect for this file, using original ({size_str})."
)
return filepath
size_str = _filesize_string(size_after)
logger.info(f"Shrinking brought filesize down to {size_str}")
return output_file
2 changes: 2 additions & 0 deletions paper2remarkable/providers/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ def rewrite_pdf(self, in_file, out_pdf=None):
self.gs_path,
"-sDEVICE=pdfwrite",
"-dQUIET",
"-dWriteXRefStm=false",
"-dWriteObjStms=false",
"-o",
out_pdf,
in_file,
Expand Down
9 changes: 5 additions & 4 deletions paper2remarkable/providers/acl.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ def _format_authors(self, soup_authors):


class ACL(Provider):
re_abs_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
re_abs_2 = "^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]+)"
re_pdf_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
re_pdf_2 = "^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
re_abs_1 = r"^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
re_abs_2 = r"^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]+)"
re_pdf_1 = r"^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
re_pdf_2 = r"^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -59,6 +59,7 @@ def get_abs_pdf_urls(self, url):

raise URLResolutionError("ACL", url)

@staticmethod
def validate(src):
return (
re.match(ACL.re_pdf_1, src)
Expand Down
16 changes: 9 additions & 7 deletions paper2remarkable/providers/acm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,30 @@ class ACMInformer(Informer):
meta_author_key = "citation_authors"

def get_title(self, soup):
target = soup.find("h1", {"class": "citation__title"})
target = soup.find("div", {"class": "core-publication-title"})
return target.text

def get_authors(self, soup):
authors = [
a["title"] for a in soup.find_all("a", {"class": "author-name"})
author_block.find("span", {"property": "familyName"}).text
for author_block in soup.find_all("span", {"property": "author"})
]
return self._format_authors(authors)
return authors

def _format_authors(self, soup_authors):
    """Format the author list via the parent implementation.

    Forwards ``soup_authors`` to the base class ``_format_authors`` with
    ``sep=" "`` and ``idx=-1`` and returns its result unchanged.
    """
    formatted = super()._format_authors(
        soup_authors,
        sep=" ",
        idx=-1,
    )
    return formatted

def get_year(self, soup):
date = soup.find("span", {"class": "epub-section__date"})
date = soup.find("span", {"class": "core-date-published"})
return self._format_year(date.text)

def _format_year(self, soup_date):
return soup_date.strip().split(" ")[-1].strip()


class ACM(Provider):
re_abs = "^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
re_pdf = "^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)(\?download=true)?"
re_abs = r"^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
re_pdf = r"^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)(\?download=true)?"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -71,6 +72,7 @@ def get_abs_pdf_urls(self, url):
raise URLResolutionError("ACM", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
m = re.match(ACM.re_abs, src) or re.match(ACM.re_pdf, src)
return not m is None
return m is not None
13 changes: 8 additions & 5 deletions paper2remarkable/providers/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ class ArxivInformer(Informer):


class Arxiv(Provider):
re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
re_abs_1 = r"https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
re_pdf_1 = r"https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?"

re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf"
re_abs_2 = r"https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
re_pdf_2 = r"https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?(\.pdf)?"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -50,12 +50,15 @@ def get_abs_pdf_urls(self, url):
abs_url = url
pdf_url = url.replace("abs", "pdf") + ".pdf"
elif re.match(self.re_pdf_1, url) or re.match(self.re_pdf_2, url):
abs_url = url[:-4].replace("pdf", "abs")
if url.endswith(".pdf"):
url = url[:-4]
abs_url = url.replace("pdf", "abs")
pdf_url = url
else:
raise URLResolutionError("arXiv", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
"""Check if the url is to an arXiv page."""
return (
Expand Down
11 changes: 6 additions & 5 deletions paper2remarkable/providers/citeseerx.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"""

import re
import time

from ..exceptions import URLResolutionError
from ..log import Logger
Expand All @@ -24,13 +23,14 @@ class CiteSeerXInformer(Informer):
meta_date_key = "citation_year"

def _format_authors(self, soup_authors):
op = lambda x: x[0].split(",")
return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
return super()._format_authors(
soup_authors, sep=" ", idx=-1, op=lambda x: x[0].split(",")
)


class CiteSeerX(Provider):
re_abs = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
re_pdf = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"
re_abs = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
re_pdf = r"^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -72,6 +72,7 @@ def get_abs_pdf_urls(self, url):
raise URLResolutionError("CiteSeerX", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
return re.match(CiteSeerX.re_abs, src) or re.match(
CiteSeerX.re_pdf, src
Expand Down
7 changes: 4 additions & 3 deletions paper2remarkable/providers/cvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ class CVFInformer(Informer):


class CVF(Provider):
re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"
re_abs = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
re_pdf = r"^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -43,6 +43,7 @@ def get_abs_pdf_urls(self, url):
raise URLResolutionError("CVF", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src)
return not m is None
return m is not None
29 changes: 17 additions & 12 deletions paper2remarkable/providers/diva.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
"""

import os
import re
import urllib.parse

import bs4

from ..exceptions import URLResolutionError, FulltextMissingError
from ..exceptions import FulltextMissingError
from ..exceptions import URLResolutionError
from ..log import Logger
from ..utils import get_page_with_retry
from ._base import Provider
Expand All @@ -25,18 +24,23 @@

class DiVAInformer(Informer):
def get_year(self, soup):
year = soup.find("meta", {"name": "citation_publication_date"}).get("content")
year = soup.find("meta", {"name": "citation_publication_date"}).get(
"content"
)
if not year:
logger.warning(
"Couldn't determine year information, maybe provide the desired filename using '--filename'?"
"Couldn't determine year information, maybe provide the "
"desired filename using '--filename'?"
)
return ""
return year


class DiVA(Provider):
re_abs = "^https?://[a-z]+.diva-portal.org/smash/record.jsf"
re_pdf = "^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT"
re_abs = r"^https?://[a-z]+.diva-portal.org/smash/record.jsf"
re_pdf = (
r"^https?://[a-z]+.diva-portal.org/smash/get/diva2:[0-9]+/FULLTEXT"
)

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -48,16 +52,16 @@ def _get_doc_url(self, abs_url):

pdf_url = soup.find("meta", {"name": "citation_pdf_url"})
if pdf_url is None:
logger.warning(
"Couldn't find the fulltext URL"
)
logger.warning("Couldn't find the fulltext URL")
raise FulltextMissingError("DiVA", abs_url)

return pdf_url.get("content")

def _get_abs_url(self, pdf_url):
diva_id = re.findall("diva2:[0-9]+", pdf_url)[0].split(":")[1]
url_candiate = re.findall("https?://[a-z]+.diva-portal.org/smash/", pdf_url)[0]
url_candiate = re.findall(
"https?://[a-z]+.diva-portal.org/smash/", pdf_url
)[0]
url_candiate += "record.jsf?pid=diva2%3A" + diva_id
return url_candiate

Expand All @@ -72,5 +76,6 @@ def get_abs_pdf_urls(self, url):
raise URLResolutionError("DiVA", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
    """Check whether ``src`` matches one of the DiVA URL patterns.

    Tries ``DiVA.re_abs`` first and ``DiVA.re_pdf`` second, returning the
    first match object found, or ``None`` when neither pattern matches.
    """
    for pattern in (DiVA.re_abs, DiVA.re_pdf):
        found = re.match(pattern, src)
        if found:
            return found
    return None
14 changes: 9 additions & 5 deletions paper2remarkable/providers/eccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def get_title(self, soup):
h4 = divsoup.find("h4")
if not h4:
logger.warning(
"Couldn't determine title information, maybe provide the desired filename using '--filename'?"
"Couldn't determine title information, maybe provide the "
"desired filename using '--filename'?"
)
return ""
return h4.get_text().strip()
Expand All @@ -43,7 +44,8 @@ def get_authors(self, soup):
)
if not aa:
logger.warning(
"Couldn't determine author information, maybe provide the desired filename using '--filename'?"
"Couldn't determine author information, maybe provide the "
"desired filename using '--filename'?"
)
return ""
authors = [a.get_text() for a in aa]
Expand All @@ -56,16 +58,17 @@ def get_year(self, soup):
)
if line is None:
logger.warning(
"Couldn't determine year information, maybe provide the desired filename using '--filename'?"
"Couldn't determine year information, maybe provide the "
"desired filename using '--filename'?"
)
return ""
year = line.strip().split(" ")[3] # bit lazy
return year


class ECCC(Provider):
re_abs = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$"
re_pdf = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$"
re_abs = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$"
re_pdf = r"https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -82,5 +85,6 @@ def get_abs_pdf_urls(self, url):
raise URLResolutionError("ECCC", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
    """Check whether ``src`` matches one of the ECCC URL patterns.

    Tries ``ECCC.re_abs`` first and ``ECCC.re_pdf`` second, returning the
    first match object found, or ``None`` when neither pattern matches.
    """
    for pattern in (ECCC.re_abs, ECCC.re_pdf):
        found = re.match(pattern, src)
        if found:
            return found
    return None
Loading

0 comments on commit dd1a5b8

Please sign in to comment.