Fixing 403 Forbidden in test_openaccess_scraper (#92)
* Added search_pdf_link, with a test

* Integrated search_pdf_link into lib

* Fully mocked the openaccess scraper since we get 403s in CI

* Fixed test_link2_to_pdf_that_can_raise_403 by also catching ClientResponseError
jamesbraza authored Apr 16, 2024
1 parent e5d473a commit 20df90f
Showing 4 changed files with 104 additions and 54 deletions.
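In short, the duplicated inline regexes for finding PDF hrefs are consolidated into the new paperscraper.utils.search_pdf_link helper, and callers fall back from the ".epdf" variant to the plain ".pdf" search. A condensed sketch of that call pattern, paraphrasing the lib.py diff below (extract_pdf_href is an illustrative name, not part of the commit):

from paperscraper.exceptions import NoPDFLinkError
from paperscraper.utils import search_pdf_link


def extract_pdf_href(html_text: str) -> str:
    # Prefer an ".epdf" link (rewritten to ".pdf" by the helper), then any ".pdf" link.
    try:
        return search_pdf_link(html_text, epdf=True)
    except NoPDFLinkError:
        # Raises NoPDFLinkError again if no ".pdf" href matches either.
        return search_pdf_link(html_text)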
4 changes: 4 additions & 0 deletions paperscraper/exceptions.py
@@ -6,3 +6,7 @@ def __init__(self, message="DOI not found"):
 
 class CitationConversionError(Exception):
     """Exception to throw when we can't process a citation from a BibTeX."""
+
+
+class NoPDFLinkError(Exception):
+    """Exception to throw when we can't find a PDF link."""
44 changes: 22 additions & 22 deletions paperscraper/lib.py
@@ -14,11 +14,17 @@
 
 from aiohttp import ClientResponse, ClientResponseError, ClientSession, InvalidURL
 
-from .exceptions import CitationConversionError, DOINotFoundError
+from .exceptions import CitationConversionError, DOINotFoundError, NoPDFLinkError
 from .headers import get_header
 from .log_formatter import CustomFormatter
 from .scraper import Scraper
-from .utils import ThrottledClientSession, encode_id, find_doi, get_scheme_hostname
+from .utils import (
+    ThrottledClientSession,
+    encode_id,
+    find_doi,
+    get_scheme_hostname,
+    search_pdf_link,
+)
 
 year_extract_pattern = re.compile(r"\b\d{4}\b")
 
@@ -135,29 +141,23 @@ async def link_to_pdf(url, path, session: ClientSession) -> None:
     # I know this looks weird
     # I just need to try stuff and be able
     # to break out of flow if I find a pdf
-    def get_pdf():
+    def get_pdf() -> str:
         # try for chemrxiv special tag
         pdf_link = re.search(
             r'content="(https://chemrxiv.org/engage/api-gateway/chemrxiv/assets.*\.pdf)"',
             html_text,
         )
         if pdf_link:
             return pdf_link.group(1)
-        # maybe epdf
-        # should have pdf somewhere (could not be at end)
-        epdf_link = re.search(r'href="(\S+\.epdf)"', html_text)
-        if epdf_link:
-            return epdf_link.group(1).replace("epdf", "pdf")
-
-        # obvious thing
-        pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
-        if pdf_link:
-            return pdf_link.group(1)
-
-        # if we got here, we didn't find a pdf
-        raise RuntimeError(f"No PDF link found for {url}.")
+        try:
+            return search_pdf_link(html_text, epdf=True)
+        except NoPDFLinkError:
+            return search_pdf_link(html_text)
 
-    pdf_link = get_pdf()
+    try:
+        pdf_link = get_pdf()
+    except NoPDFLinkError as exc:
+        raise RuntimeError(f"No PDF link found for {url}.") from exc
     # check if the link is relative
     if pdf_link.startswith("/"):
         pdf_link = get_scheme_hostname(url) + pdf_link
@@ -183,13 +183,13 @@ async def find_pmc_pdf_link(pmc_id, session: ClientSession) -> str:
         raise RuntimeError(
             f"Failed to download PubMed Central ID {pmc_id} from URL {url}."
         ) from exc
-    html_text = await r.text()
-    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
-    if pdf_link is None:
+    try:
+        pdf_link = search_pdf_link(text=await r.text())
+    except NoPDFLinkError as exc:
         raise RuntimeError(
             f"No PDF link matched for PubMed Central ID {pmc_id} from URL {url}."
-        )
-    return f"https://www.ncbi.nlm.nih.gov{pdf_link.group(1)}"
+        ) from exc
+    return f"https://www.ncbi.nlm.nih.gov{pdf_link}"
 
 
 async def pubmed_to_pdf(pubmed_id, path, session: ClientSession) -> None:
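Both call sites above translate the new NoPDFLinkError into the RuntimeError that existing callers and tests already expect, chaining with "raise ... from exc" so the original failure stays attached. A minimal, standalone illustration of that behavior (stand-in names, not repository code):

class NoPDFLinkError(Exception):
    """Stand-in for paperscraper.exceptions.NoPDFLinkError."""


def get_pdf() -> str:
    raise NoPDFLinkError("No PDF link found.")


try:
    try:
        pdf_link = get_pdf()
    except NoPDFLinkError as exc:
        raise RuntimeError("No PDF link found for <url>.") from exc
except RuntimeError as err:
    # The chained cause is preserved for debugging.
    assert isinstance(err.__cause__, NoPDFLinkError)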
14 changes: 14 additions & 0 deletions paperscraper/utils.py
@@ -16,6 +16,8 @@
 import aiohttp
 import fitz
 
+from paperscraper.exceptions import NoPDFLinkError
+
 logger = logging.getLogger(__name__)
 
 
@@ -183,3 +185,15 @@ def get_scheme_hostname(url: str) -> str:
         query="",
         fragment="",
     ).geturl()
+
+
+def search_pdf_link(text: str, epdf: bool = False) -> str:
+    if epdf:
+        epdf_link = re.search(r'href="(\S+\.epdf)"', text)
+        if epdf_link:
+            return epdf_link.group(1).replace("epdf", "pdf")
+    else:
+        pdf_link = re.search(r'href="(\S+\.pdf)"', text)
+        if pdf_link:
+            return pdf_link.group(1)
+    raise NoPDFLinkError("No PDF link found.")
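A quick usage sketch of the new helper (the inputs here are made up, not from the repository): with epdf=True it only matches ".epdf" hrefs and rewrites "epdf" to "pdf" in the match; with the default it only matches ".pdf" hrefs; either way it raises NoPDFLinkError when nothing matches.

from paperscraper.exceptions import NoPDFLinkError
from paperscraper.utils import search_pdf_link

html = '<a href="/doi/epdf/10.1000/example.epdf">Full text</a>'
# "epdf" is replaced throughout the matched href, yielding "/doi/pdf/10.1000/example.pdf".
assert search_pdf_link(html, epdf=True) == "/doi/pdf/10.1000/example.pdf"

try:
    search_pdf_link('<a href="#">no pdf here</a>')
except NoPDFLinkError:
    print("no PDF href found")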
96 changes: 64 additions & 32 deletions tests/test_paperscraper.py
@@ -12,7 +12,11 @@
 from pybtex.database import parse_string
 
 import paperscraper
-from paperscraper.exceptions import CitationConversionError, DOINotFoundError
+from paperscraper.exceptions import (
+    CitationConversionError,
+    DOINotFoundError,
+    NoPDFLinkError,
+)
 from paperscraper.headers import get_header
 from paperscraper.lib import (
     GOOGLE_SEARCH_MAX_PAGE_SIZE,
@@ -23,7 +27,7 @@
     openaccess_scraper,
     reconcile_doi,
 )
-from paperscraper.utils import ThrottledClientSession, find_doi
+from paperscraper.utils import ThrottledClientSession, find_doi, search_pdf_link
 
 
 class TestThrottledClientSession(IsolatedAsyncioTestCase):
@@ -277,39 +281,67 @@ async def test_pmc_to_pdf(self):
         assert paperscraper.check_pdf(path)
         os.remove(path)
 
+    def test_search_pdf_link(self) -> None:
+        for url, expected in (
+            ('<link rel="schema.DC" href="http://abc.org/DC/elements/1.0/" />', None),
+            (
+                '<a href="/doi/suppl/10.1010/spam.ham.0a0/some_file/abc_001.pdf" class="ext-link">PDF</a>',  # noqa: E501
+                "/doi/suppl/10.1010/spam.ham.0a0/some_file/abc_001.pdf",
+            ),
+            (
+                '<form method="POST" action="/deliver/fulltext/foo/71/1/spam-ham-123-456.pdf?itemId=%2Fcontent%2Fjournals%2F10.1010%2Fabc-def-012000-123&mimeType=pdf&containerItemId=content/journals/applesauce"\ntarget="/content/journals/10.1010/abc-def-012000-123-pdf" \ndata-title',  # noqa: E501
+                None,
+            ),
+            (
+                '<a href="#" class="fa fa-file-pdf-o access-options-icon"\nrole="button"><span class="sr-only">file format pdf download</span></a>',  # noqa: E501
+                None,
+            ),
+        ):
+            if isinstance(expected, str):
+                assert search_pdf_link(url) == expected
+            else:
+                try:
+                    search_pdf_link(url)
+                except NoPDFLinkError:
+                    pass
+                else:
+                    raise AssertionError("Should be unreachable")
+
     async def test_openaccess_scraper(self) -> None:
         assert not await openaccess_scraper(
             {"openAccessPdf": None}, MagicMock(), MagicMock()
         )
 
-        async with ThrottledClientSession(
-            rate_limit=RateLimits.SCRAPER.value, headers=get_header()
-        ) as session:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                await openaccess_scraper(
-                    {
-                        "openAccessPdf": {
-                            "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
-                        }
-                    },
-                    os.path.join(tmpdir, "test1.pdf"),
-                    session,
-                )
-                try:
-                    # Confirm we can regex parse without a malformed URL error
-                    await openaccess_scraper(
-                        {
-                            "openAccessPdf": {
-                                "url": "https://www.annualreviews.org/doi/full/10.1146/annurev-physchem-042018-052331"
-                            }
-                        },
-                        os.path.join(tmpdir, "test2.pdf"),
-                        session,
-                    )
-                except RuntimeError as exc:
-                    assert "No PDF link" in str(exc)  # noqa: PT017
-                else:
-                    raise AssertionError("Expected to fail with a RuntimeError")
+        mock_session = MagicMock()
+        call_index = 0
+
+        @contextlib.asynccontextmanager
+        async def mock_session_get(*_, **__):
+            mock_response = MagicMock(spec_set=aiohttp.ClientResponse)
+            nonlocal call_index
+            call_index += 1
+            if call_index == 1:
+                mock_response.text.side_effect = [
+                    '<a class="suppl-anchor" href="/doi/suppl/10.1021/acs.nanolett.0c00513/suppl_file/nl0c00513_si_001.pdf">'  # noqa: E501
+                ]
+            else:
+                mock_response.headers = {
+                    "Content-Type": "application/pdf;charset=UTF-8"
+                }
+                mock_response.read.side_effect = [b"stub"]
+            yield mock_response
+
+        mock_session.get.side_effect = mock_session_get
+        with tempfile.TemporaryDirectory() as tmpdir:
+            await openaccess_scraper(
+                {
+                    "openAccessPdf": {
+                        "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    }
+                },
+                os.path.join(tmpdir, "test.pdf"),
+                mock_session,
+            )
 
     async def test_pubmed_to_pdf(self):
         path = "test.pdf"
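The rewritten test never touches pubs.acs.org: session.get is replaced by an async-context-manager factory that serves a canned HTML page with a ".pdf" href on the first call and a stub PDF payload on the second, so CI can no longer hit a 403. A self-contained sketch of that mocking pattern (fetch_twice and make_mock_session are illustrative names, not repository code):

import asyncio
import contextlib
from unittest.mock import MagicMock

import aiohttp


async def fetch_twice(session) -> tuple[str, bytes]:
    # Mirrors the scraper's flow: fetch an HTML landing page, then fetch the PDF bytes.
    async with session.get("https://example.org/landing") as resp:
        html = await resp.text()
    async with session.get("https://example.org/file.pdf") as resp:
        assert resp.headers["Content-Type"].startswith("application/pdf")
        pdf = await resp.read()
    return html, pdf


def make_mock_session() -> MagicMock:
    call_index = 0

    @contextlib.asynccontextmanager
    async def mock_get(*_, **__):
        nonlocal call_index
        call_index += 1
        # With a spec, async methods like .text()/.read() become awaitable mocks.
        response = MagicMock(spec_set=aiohttp.ClientResponse)
        if call_index == 1:
            response.text.side_effect = ['<a href="/file.pdf">PDF</a>']
        else:
            response.headers = {"Content-Type": "application/pdf"}
            response.read.side_effect = [b"%PDF-stub"]
        yield response

    session = MagicMock()
    session.get.side_effect = mock_get
    return session


print(asyncio.run(fetch_twice(make_mock_session())))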
@@ -330,7 +362,7 @@ async def test_link_to_pdf(self):
         assert paperscraper.check_pdf(path)
         os.remove(path)
 
-    async def test_link2_to_pdf_that_can_raise_403(self):
+    async def test_link2_to_pdf_that_can_raise_403(self) -> None:
         link = "https://journals.sagepub.com/doi/pdf/10.1177/1087057113498418"
         path = "test.pdf"
         try:
@@ -340,7 +372,7 @@ async def test_link2_to_pdf_that_can_raise_403(self):
                 await paperscraper.link_to_pdf(link, path, session)
                 os.remove(path)
 
-        except RuntimeError as e:
+        except (RuntimeError, aiohttp.ClientResponseError) as e:
             assert "403" in str(e)  # noqa: PT017
 
     async def test_link3_to_pdf(self):
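The broadened except clause works because, when the session raises for a 403, aiohttp surfaces it as a ClientResponseError whose string form includes the HTTP status, which is what the existing '"403" in str(e)' assertion relies on. A rough illustration under that assumption (the MagicMock request info is a placeholder):

from unittest.mock import MagicMock

import aiohttp

err = aiohttp.ClientResponseError(MagicMock(), (), status=403, message="Forbidden")
assert "403" in str(err)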
