Fixing 403 Forbidden in test_openaccess_scraper (#92)
* Added search_pdf_link, with a test

* Integrated search_pdf_link into lib

* Fully mocked the openaccess scraper since we get 403s in CI

* Fixed test_link2_to_pdf_that_can_raise_403 by also catching ClientResponseError
jamesbraza authored Apr 16, 2024
1 parent e5d473a commit 20df90f
Showing 4 changed files with 104 additions and 54 deletions.
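In short, the duplicated inline regexes for finding PDF hrefs are consolidated into the new paperscraper.utils.search_pdf_link helper, and callers fall back from the ".epdf" variant to the plain ".pdf" search. A condensed sketch of that call pattern, paraphrasing the lib.py diff below (extract_pdf_href is an illustrative name, not part of the commit):

from paperscraper.exceptions import NoPDFLinkError
from paperscraper.utils import search_pdf_link


def extract_pdf_href(html_text: str) -> str:
    # Prefer an ".epdf" link (rewritten to ".pdf" by the helper), then any ".pdf" link.
    try:
        return search_pdf_link(html_text, epdf=True)
    except NoPDFLinkError:
        # Raises NoPDFLinkError again if no ".pdf" href matches either.
        return search_pdf_link(html_text)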
4 changes: 4 additions & 0 deletions paperscraper/exceptions.py
@@ -6,3 +6,7 @@ def __init__(self, message="DOI not found"):
 
 class CitationConversionError(Exception):
     """Exception to throw when we can't process a citation from a BibTeX."""
+
+
+class NoPDFLinkError(Exception):
+    """Exception to throw when we can't find a PDF link."""
44 changes: 22 additions & 22 deletions paperscraper/lib.py
@@ -14,11 +14,17 @@
 
 from aiohttp import ClientResponse, ClientResponseError, ClientSession, InvalidURL
 
-from .exceptions import CitationConversionError, DOINotFoundError
+from .exceptions import CitationConversionError, DOINotFoundError, NoPDFLinkError
 from .headers import get_header
 from .log_formatter import CustomFormatter
 from .scraper import Scraper
-from .utils import ThrottledClientSession, encode_id, find_doi, get_scheme_hostname
+from .utils import (
+    ThrottledClientSession,
+    encode_id,
+    find_doi,
+    get_scheme_hostname,
+    search_pdf_link,
+)
 
 year_extract_pattern = re.compile(r"\b\d{4}\b")
 
@@ -135,29 +141,23 @@ async def link_to_pdf(url, path, session: ClientSession) -> None:
     # I know this looks weird
     # I just need to try stuff and be able
     # to break out of flow if I find a pdf
-    def get_pdf():
+    def get_pdf() -> str:
         # try for chemrxiv special tag
         pdf_link = re.search(
             r'content="(https://chemrxiv.org/engage/api-gateway/chemrxiv/assets.*\.pdf)"',
             html_text,
         )
         if pdf_link:
             return pdf_link.group(1)
-        # maybe epdf
-        # should have pdf somewhere (could not be at end)
-        epdf_link = re.search(r'href="(\S+\.epdf)"', html_text)
-        if epdf_link:
-            return epdf_link.group(1).replace("epdf", "pdf")
-
-        # obvious thing
-        pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
-        if pdf_link:
-            return pdf_link.group(1)
-
-        # if we got here, we didn't find a pdf
-        raise RuntimeError(f"No PDF link found for {url}.")
+        try:
+            return search_pdf_link(html_text, epdf=True)
+        except NoPDFLinkError:
+            return search_pdf_link(html_text)
 
-    pdf_link = get_pdf()
+    try:
+        pdf_link = get_pdf()
+    except NoPDFLinkError as exc:
+        raise RuntimeError(f"No PDF link found for {url}.") from exc
     # check if the link is relative
     if pdf_link.startswith("/"):
         pdf_link = get_scheme_hostname(url) + pdf_link
@@ -183,13 +183,13 @@ async def find_pmc_pdf_link(pmc_id, session: ClientSession) -> str:
         raise RuntimeError(
             f"Failed to download PubMed Central ID {pmc_id} from URL {url}."
         ) from exc
-    html_text = await r.text()
-    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
-    if pdf_link is None:
+    try:
+        pdf_link = search_pdf_link(text=await r.text())
+    except NoPDFLinkError as exc:
         raise RuntimeError(
             f"No PDF link matched for PubMed Central ID {pmc_id} from URL {url}."
-        )
-    return f"https://www.ncbi.nlm.nih.gov{pdf_link.group(1)}"
+        ) from exc
+    return f"https://www.ncbi.nlm.nih.gov{pdf_link}"
 
 
 async def pubmed_to_pdf(pubmed_id, path, session: ClientSession) -> None:
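Both call sites above translate the new NoPDFLinkError into the RuntimeError that existing callers and tests already expect, chaining with "raise ... from exc" so the original failure stays attached. A minimal, standalone illustration of that behavior (stand-in names, not repository code):

class NoPDFLinkError(Exception):
    """Stand-in for paperscraper.exceptions.NoPDFLinkError."""


def get_pdf() -> str:
    raise NoPDFLinkError("No PDF link found.")


try:
    try:
        pdf_link = get_pdf()
    except NoPDFLinkError as exc:
        raise RuntimeError("No PDF link found for <url>.") from exc
except RuntimeError as err:
    # The chained cause is preserved for debugging.
    assert isinstance(err.__cause__, NoPDFLinkError)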
14 changes: 14 additions & 0 deletions paperscraper/utils.py
@@ -16,6 +16,8 @@
 import aiohttp
 import fitz
 
+from paperscraper.exceptions import NoPDFLinkError
+
 logger = logging.getLogger(__name__)
 
 
@@ -183,3 +185,15 @@ def get_scheme_hostname(url: str) -> str:
         query="",
         fragment="",
     ).geturl()
+
+
+def search_pdf_link(text: str, epdf: bool = False) -> str:
+    if epdf:
+        epdf_link = re.search(r'href="(\S+\.epdf)"', text)
+        if epdf_link:
+            return epdf_link.group(1).replace("epdf", "pdf")
+    else:
+        pdf_link = re.search(r'href="(\S+\.pdf)"', text)
+        if pdf_link:
+            return pdf_link.group(1)
+    raise NoPDFLinkError("No PDF link found.")
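A quick usage sketch of the new helper (the inputs here are made up, not from the repository): with epdf=True it only matches ".epdf" hrefs and rewrites "epdf" to "pdf" in the match; with the default it only matches ".pdf" hrefs; either way it raises NoPDFLinkError when nothing matches.

from paperscraper.exceptions import NoPDFLinkError
from paperscraper.utils import search_pdf_link

html = '<a href="/doi/epdf/10.1000/example.epdf">Full text</a>'
# "epdf" is replaced throughout the matched href, yielding "/doi/pdf/10.1000/example.pdf".
assert search_pdf_link(html, epdf=True) == "/doi/pdf/10.1000/example.pdf"

try:
    search_pdf_link('<a href="#">no pdf here</a>')
except NoPDFLinkError:
    print("no PDF href found")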
96 changes: 64 additions & 32 deletions tests/test_paperscraper.py
@@ -12,7 +12,11 @@
 from pybtex.database import parse_string
 
 import paperscraper
-from paperscraper.exceptions import CitationConversionError, DOINotFoundError
+from paperscraper.exceptions import (
+    CitationConversionError,
+    DOINotFoundError,
+    NoPDFLinkError,
+)
 from paperscraper.headers import get_header
 from paperscraper.lib import (
     GOOGLE_SEARCH_MAX_PAGE_SIZE,
@@ -23,7 +27,7 @@
     openaccess_scraper,
     reconcile_doi,
 )
-from paperscraper.utils import ThrottledClientSession, find_doi
+from paperscraper.utils import ThrottledClientSession, find_doi, search_pdf_link
 
 
 class TestThrottledClientSession(IsolatedAsyncioTestCase):
@@ -277,39 +281,67 @@ async def test_pmc_to_pdf(self):
         assert paperscraper.check_pdf(path)
         os.remove(path)
 
+    def test_search_pdf_link(self) -> None:
+        for url, expected in (
+            ('<link rel="schema.DC" href="http://abc.org/DC/elements/1.0/" />', None),
+            (
+                '<a href="/doi/suppl/10.1010/spam.ham.0a0/some_file/abc_001.pdf" class="ext-link">PDF</a>',  # noqa: E501
+                "/doi/suppl/10.1010/spam.ham.0a0/some_file/abc_001.pdf",
+            ),
+            (
+                '<form method="POST" action="/deliver/fulltext/foo/71/1/spam-ham-123-456.pdf?itemId=%2Fcontent%2Fjournals%2F10.1010%2Fabc-def-012000-123&mimeType=pdf&containerItemId=content/journals/applesauce"\ntarget="/content/journals/10.1010/abc-def-012000-123-pdf" \ndata-title',  # noqa: E501
+                None,
+            ),
+            (
+                '<a href="#" class="fa fa-file-pdf-o access-options-icon"\nrole="button"><span class="sr-only">file format pdf download</span></a>',  # noqa: E501
+                None,
+            ),
+        ):
+            if isinstance(expected, str):
+                assert search_pdf_link(url) == expected
+            else:
+                try:
+                    search_pdf_link(url)
+                except NoPDFLinkError:
+                    pass
+                else:
+                    raise AssertionError("Should be unreachable")
+
     async def test_openaccess_scraper(self) -> None:
         assert not await openaccess_scraper(
             {"openAccessPdf": None}, MagicMock(), MagicMock()
         )
 
-        async with ThrottledClientSession(
-            rate_limit=RateLimits.SCRAPER.value, headers=get_header()
-        ) as session:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                await openaccess_scraper(
-                    {
-                        "openAccessPdf": {
-                            "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
-                        }
-                    },
-                    os.path.join(tmpdir, "test1.pdf"),
-                    session,
-                )
-                try:
-                    # Confirm we can regex parse without a malformed URL error
-                    await openaccess_scraper(
-                        {
-                            "openAccessPdf": {
-                                "url": "https://www.annualreviews.org/doi/full/10.1146/annurev-physchem-042018-052331"
-                            }
-                        },
-                        os.path.join(tmpdir, "test2.pdf"),
-                        session,
-                    )
-                except RuntimeError as exc:
-                    assert "No PDF link" in str(exc)  # noqa: PT017
-                else:
-                    raise AssertionError("Expected to fail with a RuntimeError")
+        mock_session = MagicMock()
+        call_index = 0
+
+        @contextlib.asynccontextmanager
+        async def mock_session_get(*_, **__):
+            mock_response = MagicMock(spec_set=aiohttp.ClientResponse)
+            nonlocal call_index
+            call_index += 1
+            if call_index == 1:
+                mock_response.text.side_effect = [
+                    '<a class="suppl-anchor" href="/doi/suppl/10.1021/acs.nanolett.0c00513/suppl_file/nl0c00513_si_001.pdf">'  # noqa: E501
+                ]
+            else:
+                mock_response.headers = {
+                    "Content-Type": "application/pdf;charset=UTF-8"
+                }
+                mock_response.read.side_effect = [b"stub"]
+            yield mock_response
+
+        mock_session.get.side_effect = mock_session_get
+        with tempfile.TemporaryDirectory() as tmpdir:
+            await openaccess_scraper(
+                {
+                    "openAccessPdf": {
+                        "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    }
+                },
+                os.path.join(tmpdir, "test.pdf"),
+                mock_session,
+            )
 
     async def test_pubmed_to_pdf(self):
         path = "test.pdf"
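The rewritten test never touches pubs.acs.org: session.get is replaced by an async-context-manager factory that serves a canned HTML page with a ".pdf" href on the first call and a stub PDF payload on the second, so CI can no longer hit a 403. A self-contained sketch of that mocking pattern (fetch_twice and make_mock_session are illustrative names, not repository code):

import asyncio
import contextlib
from unittest.mock import MagicMock

import aiohttp


async def fetch_twice(session) -> tuple[str, bytes]:
    # Mirrors the scraper's flow: fetch an HTML landing page, then fetch the PDF bytes.
    async with session.get("https://example.org/landing") as resp:
        html = await resp.text()
    async with session.get("https://example.org/file.pdf") as resp:
        assert resp.headers["Content-Type"].startswith("application/pdf")
        pdf = await resp.read()
    return html, pdf


def make_mock_session() -> MagicMock:
    call_index = 0

    @contextlib.asynccontextmanager
    async def mock_get(*_, **__):
        nonlocal call_index
        call_index += 1
        # With a spec, async methods like .text()/.read() become awaitable mocks.
        response = MagicMock(spec_set=aiohttp.ClientResponse)
        if call_index == 1:
            response.text.side_effect = ['<a href="/file.pdf">PDF</a>']
        else:
            response.headers = {"Content-Type": "application/pdf"}
            response.read.side_effect = [b"%PDF-stub"]
        yield response

    session = MagicMock()
    session.get.side_effect = mock_get
    return session


print(asyncio.run(fetch_twice(make_mock_session())))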
@@ -330,7 +362,7 @@ async def test_link_to_pdf(self):
         assert paperscraper.check_pdf(path)
         os.remove(path)
 
-    async def test_link2_to_pdf_that_can_raise_403(self):
+    async def test_link2_to_pdf_that_can_raise_403(self) -> None:
         link = "https://journals.sagepub.com/doi/pdf/10.1177/1087057113498418"
         path = "test.pdf"
         try:
@@ -340,7 +372,7 @@ async def test_link2_to_pdf_that_can_raise_403(self):
                 await paperscraper.link_to_pdf(link, path, session)
                 os.remove(path)
 
-        except RuntimeError as e:
+        except (RuntimeError, aiohttp.ClientResponseError) as e:
             assert "403" in str(e)  # noqa: PT017
 
     async def test_link3_to_pdf(self):
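The broadened except clause works because, when the session raises for a 403, aiohttp surfaces it as a ClientResponseError whose string form includes the HTTP status, which is what the existing '"403" in str(e)' assertion relies on. A rough illustration under that assumption (the MagicMock request info is a placeholder):

from unittest.mock import MagicMock

import aiohttp

err = aiohttp.ClientResponseError(MagicMock(), (), status=403, message="Forbidden")
assert "403" in str(err)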
