Skip to content

Commit

Permalink
Adds test for html sourced paper, arxivce-1356
Browse files Browse the repository at this point in the history
Uses newer arxiv-base
  • Loading branch information
bdc34 committed Mar 20, 2024
1 parent 467364e commit 6b8feb9
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 40 deletions.
4 changes: 3 additions & 1 deletion browse/controllers/files/dissemination.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,11 @@ def _html_response(format: FileFormat,
arxiv_id: Identifier,
docmeta: DocMetadata,
version: VersionEntry) -> Response:
if docmeta.source_format == 'html':
if docmeta.source_format == 'html' or version.source_flag.html:
if isinstance(file_list, FileObj):
return _html_source_single_response(file_list, arxiv_id)
elif len(file_list) == 1:
return _html_source_single_response(file_list[0], arxiv_id)
else:
return _html_source_listing_response(file_list, arxiv_id)
elif isinstance(file_list, FileObj):
Expand Down
3 changes: 2 additions & 1 deletion browse/services/dissemination/article_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,8 @@ def _e_print(self,

def _html(self, arxiv_id: Identifier, docmeta: DocMetadata, version: VersionEntry) -> FormatHandlerReturn:
"""Gets the html src as submitted for the arxiv_id. Returns `FileObj` if found, `None` if not."""
if docmeta.source_format == 'html': # paper source is html
if docmeta.source_format == 'html' or version.source_flag.html: # paper source is html
# note: the preprocessed html is expected to exist in the ps_cache
path = ps_cache_html_path(arxiv_id, version.version)
if arxiv_id.extra: # requesting a specific file
return self.objstore.to_obj(path + arxiv_id.extra)
Expand Down
71 changes: 40 additions & 31 deletions browse/services/html_processing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import arxiv.document.exceptions
from flask import render_template, url_for
from arxiv.identifier import Identifier
from arxiv.identifier import Identifier, IdentifierException
import re
from io import BytesIO
import urllib.parse
from arxiv.document.metadata import DocMetadata
from browse.services.documents import get_doc_service
from browse.controllers.list_page import dl_for_article, latexml_links_for_article, authors_for_article
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Lenient matcher for arXiv identifiers in raw bytes: an optional "arXiv:" prefix,
# then either an old-style id (archive, optional ".XX" subject class, "/", 7 digits)
# or a new-style id (4 digits, ".", 4-5 digits), followed by an optional "vN" suffix.
# Written as a raw bytes literal so the regex escapes (\. \/ \d) are not parsed as
# invalid string escape sequences; the resulting pattern bytes are unchanged.
LAX_ID_REGEX = rb'(arXiv:)?([a-z-]+(\.[A-Z][A-Z])?\/\d{7}|\d{4}\.\d{4,5})(v\d+)?'

Expand All @@ -16,36 +21,40 @@ def post_process_html(byte_line:bytes) -> bytes:
report_no_match = re.match(b'^\s*REPORT-NO:([A-Za-z0-9-\/]+)', byte_line, re.I)

if list_match:
cmd = list_match.group(1) #which command to perform
if cmd==b'ABS':
include_abstract=True
else:
include_abstract=False
id = list_match.group(2).decode('utf-8') #document ID
arxiv_id=Identifier(id)

new_html = "<dl>\n"

if arxiv_id:
#get and format metadata here as html
metadata=get_doc_service().get_abs(arxiv_id)
downloads= dl_for_article(metadata)
latexml=latexml_links_for_article(metadata)
author_links=authors_for_article(metadata)
item_string=render_template('list/conference_item.html',
item=metadata,
include_abstract=include_abstract,
downloads=downloads,
latexml=latexml,
author_links=author_links,
url_for_author_search=author_query )

new_html+= item_string
else:
new_html += f"<dd>{id} [failed to get identifier for paper]</dd>\n"

new_html += "</dl>\n"
new_bytes=new_html.encode('utf-8')
try:
cmd = list_match.group(1) #which command to perform
if cmd==b'ABS':
include_abstract=True
else:
include_abstract=False
id = list_match.group(2).decode('utf-8') #document ID
arxiv_id=Identifier(id)

new_html = "<dl>\n"

if arxiv_id:
#get and format metadata here as html
metadata=get_doc_service().get_abs(arxiv_id)
downloads= dl_for_article(metadata)
latexml=latexml_links_for_article(metadata)
author_links=authors_for_article(metadata)
item_string=render_template('list/conference_item.html',
item=metadata,
include_abstract=include_abstract,
downloads=downloads,
latexml=latexml,
author_links=author_links,
url_for_author_search=author_query )

new_html+= item_string
else:
new_html += f"<dd>{id} [failed to get identifier for paper]</dd>\n"

new_html += "</dl>\n"
new_bytes=new_html.encode('utf-8')
except (arxiv.document.exceptions.AbsException, IdentifierException ) as ee:
new_bytes = byte_line
logger.warning(f"Source of html paper had a problem during post_process_html: {ee}")

elif report_no_match: #need to find proceeding to test with
rn = report_no_match.group(1).decode('utf-8')
Expand Down
3 changes: 1 addition & 2 deletions get_test_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@
import argparse
from pathlib import Path

from arxiv.files import key_patterns
from arxiv.identifier import Identifier

from google.cloud import storage

from browse.services import key_patterns


def get_article_for_test(bucket, save_base_dir: str, arxiv_id: Identifier):
"""Gets from the production bucket all the files related to an arxiv_id
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ types-python-dateutil = "^2.8.19.12"
pg8000 = "^1.30.1"
lxml = "^4.9.2"
xmltodict = "^0.13.0"
arxiv-base = {git = "https://github.com/arXiv/arxiv-base.git", rev = "8b3c9cb0b1fc082ad9881f17cc5f942ced80767a"}
arxiv-base = {git = "https://github.com/arXiv/arxiv-base.git", rev = "f2b07f5e"}
flask = "^3.0.2"
google-cloud-compute = "^1.14.1"

Expand Down
17 changes: 17 additions & 0 deletions tests/data/abs_files/ftp/arxiv/papers/2403/2403.10561.abs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
------------------------------------------------------------------------------
\\
arXiv:2403.10561
From: Dimitris Spathis <example@example.com>
Date: Thu, 14 Mar 2024 08:46:07 GMT (1kb,H)

Title: A collection of the accepted papers for the Human-Centric Representation
Learning workshop at AAAI 2024
Authors: Dimitris Spathis, Aaqib Saeed, Ali Etemad, Sana Tonekaboni, Stefanos
Laskaridis, Shohreh Deldari, Chi Ian Tang, Patrick Schwab, Shyam Tailor
Categories: cs.LG cs.AI
License: http://creativecommons.org/licenses/by-nc-nd/4.0/
\\
This non-archival index is not complete, as some accepted papers chose to
opt-out of inclusion. The list of all accepted papers is available on the
workshop website.
\\
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<html>
<head>
<title>Human-Centric Representation Learning workshop at AAAI 2024</title>
</head>

<body>
<h1>A collection of the accepted papers for the Human-Centric Representation Learning workshop at AAAI 2024</h1>


Editors: Dimitris Spathis, Aaqib Saeed, Ali Etemad, Sana Tonekaboni, Stefanos Laskaridis, Shohreh Deldari, Chi Ian Tang, Patrick Schwab, Shyam Tailor<br />

<p>This non-archival index is not complete, as some accepted papers chose to opt-out of inclusion. The list of all accepted papers is available on the <a href="https://hcrl-workshop.github.io/2024/papers">workshop website</a>.</p>

<!-- LaFFi: Leveraging Hybrid Natural Language Feedback for Fine-tuning Language Models -->
LIST:arXiv:2401.00907

<!-- Evaluating Fairness in Self-supervised and Supervised Models for Sequential Data -->
LIST:arXiv:2401.01640

<!-- Advancing Ante Hoc Explainable Models through Generative Adversarial Networks -->
LIST:arXiv:2401.04647

<!-- H2G2-Net: A Hierarchical Heterogeneous Graph Generative Network Framework for Discovery of Multi-Modal Physiological Responses -->
LIST:arXiv:2401.02905

<!-- Improving Activation Steering in Language Models with Mean-Centring -->
LIST:arXiv:2312.03813

<!-- Learning Human-like Representations to Enable Learning Human Values -->
LIST:arXiv:2312.14106

<!-- Semi-Supervised Graph Representation Learning with Human-centric Explanation for Predicting Fatty Liver Disease -->
LIST:arXiv:2403.02786

<!-- Representation Learning for Wearable-Based Applications in the Case of Missing Data -->
LIST:arXiv:2401.05437

<!-- Balancing Continual Learning and Fine-tuning for Human Activity Recognition -->
LIST:arXiv:2401.02255


</body>
</html>
11 changes: 11 additions & 0 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def test_html_paper(client_with_test_fs):
    """Exercise the abs and html routes for paper 2403.10561, whose source is HTML."""
    # The abstract page should answer a HEAD request successfully.
    abs_head = client_with_test_fs.head("/abs/2403.10561")
    assert abs_head.status_code == 200

    # Requesting a file name that is not expected to exist should give a 404.
    missing_file = client_with_test_fs.get("/html/2403.10561/shouldnotexist.html")
    assert missing_file.status_code == 404

    # The bare html route should serve the paper and include its title text.
    html_page = client_with_test_fs.get("/html/2403.10561")
    assert html_page.status_code == 200
    page_text = html_page.data.decode()
    assert "Human-Centric" in page_text

0 comments on commit 6b8feb9

Please sign in to comment.