Adds test for HTML-sourced paper (ARXIVCE-1356)

Uses newer arxiv-base
ConnectedPapers · Mar 20, 2024 · 6b8feb9 · 6b8feb9
1 parent 467364e
commit 6b8feb9
Show file tree

Hide file tree

Showing 10 changed files with 122 additions and 40 deletions.
diff --git a/browse/controllers/files/dissemination.py b/browse/controllers/files/dissemination.py
@@ -192,9 +192,11 @@ def _html_response(format: FileFormat,
                    arxiv_id: Identifier,
                    docmeta: DocMetadata,
                    version: VersionEntry) -> Response:
-    if docmeta.source_format == 'html':
+    if docmeta.source_format == 'html' or version.source_flag.html:
         if isinstance(file_list, FileObj):
             return _html_source_single_response(file_list, arxiv_id)
+        elif len(file_list) == 1:
+            return _html_source_single_response(file_list[0], arxiv_id)
         else:
             return _html_source_listing_response(file_list, arxiv_id)
     elif isinstance(file_list, FileObj):

diff --git a/browse/services/dissemination/article_store.py b/browse/services/dissemination/article_store.py
@@ -447,7 +447,8 @@ def _e_print(self,
 
     def _html(self, arxiv_id: Identifier, docmeta: DocMetadata, version: VersionEntry) -> FormatHandlerReturn:
         """Gets the html src as submitted for the arxiv_id. Returns `FileObj` if found, `None` if not."""
-        if docmeta.source_format == 'html': # paper source is html
+        if docmeta.source_format == 'html' or version.source_flag.html: # paper source is html
+            # note: the preprocessed html is expected to exist in the ps_cache
             path = ps_cache_html_path(arxiv_id, version.version)
             if arxiv_id.extra:  # requesting a specific file
                 return self.objstore.to_obj(path + arxiv_id.extra)

diff --git a/browse/services/html_processing/__init__.py b/browse/services/html_processing/__init__.py
@@ -1,11 +1,16 @@
+import arxiv.document.exceptions
 from flask import render_template, url_for
-from arxiv.identifier import Identifier
+from arxiv.identifier import Identifier, IdentifierException
 import re
 from io import BytesIO
 import urllib.parse
 from arxiv.document.metadata import DocMetadata
 from browse.services.documents import get_doc_service
 from browse.controllers.list_page import dl_for_article, latexml_links_for_article, authors_for_article
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 LAX_ID_REGEX = b'(arXiv:)?([a-z-]+(\.[A-Z][A-Z])?\/\d{7}|\d{4}\.\d{4,5})(v\d+)?'
 
@@ -16,36 +21,40 @@ def post_process_html(byte_line:bytes) -> bytes:
     report_no_match = re.match(b'^\s*REPORT-NO:([A-Za-z0-9-\/]+)', byte_line, re.I)
 
     if list_match:
-        cmd = list_match.group(1) #which command to perform
-        if cmd==b'ABS':
-            include_abstract=True
-        else:
-            include_abstract=False
-        id = list_match.group(2).decode('utf-8') #document ID
-        arxiv_id=Identifier(id) 
-
-        new_html = "<dl>\n"
-
-        if arxiv_id:
-            #get and format metadata here as html
-            metadata=get_doc_service().get_abs(arxiv_id)
-            downloads= dl_for_article(metadata)
-            latexml=latexml_links_for_article(metadata)
-            author_links=authors_for_article(metadata)
-            item_string=render_template('list/conference_item.html', 
-                                        item=metadata, 
-                                        include_abstract=include_abstract, 
-                                        downloads=downloads, 
-                                        latexml=latexml, 
-                                        author_links=author_links,
-                                        url_for_author_search=author_query )     
-
-            new_html+= item_string
-        else:
-            new_html += f"<dd>{id} [failed to get identifier for paper]</dd>\n"
-
-        new_html += "</dl>\n"
-        new_bytes=new_html.encode('utf-8')
+        try:
+            cmd = list_match.group(1) #which command to perform
+            if cmd==b'ABS':
+                include_abstract=True
+            else:
+                include_abstract=False
+            id = list_match.group(2).decode('utf-8') #document ID
+            arxiv_id=Identifier(id)
+
+            new_html = "<dl>\n"
+
+            if arxiv_id:
+                #get and format metadata here as html
+                metadata=get_doc_service().get_abs(arxiv_id)
+                downloads= dl_for_article(metadata)
+                latexml=latexml_links_for_article(metadata)
+                author_links=authors_for_article(metadata)
+                item_string=render_template('list/conference_item.html',
+                                            item=metadata,
+                                            include_abstract=include_abstract,
+                                            downloads=downloads,
+                                            latexml=latexml,
+                                            author_links=author_links,
+                                            url_for_author_search=author_query )
+
+                new_html+= item_string
+            else:
+                new_html += f"<dd>{id} [failed to get identifier for paper]</dd>\n"
+
+            new_html += "</dl>\n"
+            new_bytes=new_html.encode('utf-8')
+        except (arxiv.document.exceptions.AbsException, IdentifierException ) as ee:
+            new_bytes = byte_line
+            logger.warning(f"Source of html paper had a problem during post_process_html: {ee}")
 
     elif report_no_match: #need to find proceeding to test with
         rn = report_no_match.group(1).decode('utf-8')

diff --git a/get_test_article.py b/get_test_article.py
@@ -11,12 +11,11 @@
 import argparse
 from pathlib import Path
 
+from arxiv.files import key_patterns
 from arxiv.identifier import Identifier
 
 from google.cloud import storage
 
-from browse.services import key_patterns
-
 
 def get_article_for_test(bucket, save_base_dir: str, arxiv_id: Identifier):
     """Gets from the production bucket all the files related to an arxiv_id

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,7 +36,7 @@ types-python-dateutil = "^2.8.19.12"
 pg8000 = "^1.30.1"
 lxml = "^4.9.2"
 xmltodict = "^0.13.0"
-arxiv-base = {git = "https://github.com/arXiv/arxiv-base.git", rev = "8b3c9cb0b1fc082ad9881f17cc5f942ced80767a"}
+arxiv-base = {git = "https://github.com/arXiv/arxiv-base.git", rev = "f2b07f5e"}
 flask = "^3.0.2"
 google-cloud-compute = "^1.14.1"
 

diff --git a/tests/data/abs_files/ftp/arxiv/papers/2403/2403.10561.abs b/tests/data/abs_files/ftp/arxiv/papers/2403/2403.10561.abs
@@ -0,0 +1,17 @@
+------------------------------------------------------------------------------
+\\
+arXiv:2403.10561
+From: Dimitris Spathis <example@example.com>
+Date: Thu, 14 Mar 2024 08:46:07 GMT   (1kb,H)
+
+Title: A collection of the accepted papers for the Human-Centric Representation
+  Learning workshop at AAAI 2024
+Authors: Dimitris Spathis, Aaqib Saeed, Ali Etemad, Sana Tonekaboni, Stefanos
+  Laskaridis, Shohreh Deldari, Chi Ian Tang, Patrick Schwab, Shyam Tailor
+Categories: cs.LG cs.AI
+License: http://creativecommons.org/licenses/by-nc-nd/4.0/
+\\
+  This non-archival index is not complete, as some accepted papers chose to
+opt-out of inclusion. The list of all accepted papers is available on the
+workshop website.
+\\
diff --git a/tests/data/abs_files/ftp/arxiv/papers/2403/2403.10561.html.gz b/tests/data/abs_files/ftp/arxiv/papers/2403/2403.10561.html.gz
diff --git a/tests/data/abs_files/ps_cache/arxiv/html/2403/2403.10561v1/2403.10561.html b/tests/data/abs_files/ps_cache/arxiv/html/2403/2403.10561v1/2403.10561.html
@@ -0,0 +1,43 @@
+<html>
+<head>
+<title>Human-Centric Representation Learning workshop at AAAI 2024</title>
+</head>
+
+<body>
+<h1>A collection of the accepted papers for the Human-Centric Representation Learning workshop at AAAI 2024</h1>
+
+
+Editors: Dimitris Spathis, Aaqib Saeed, Ali Etemad, Sana Tonekaboni, Stefanos Laskaridis, Shohreh Deldari, Chi Ian Tang, Patrick Schwab, Shyam Tailor<br />
+
+ <p>This non-archival index is not complete, as some accepted papers chose to opt-out of inclusion. The list of all accepted papers is available on the <a href="https://hcrl-workshop.github.io/2024/papers">workshop website</a>.</p>
+
+<!-- LaFFi: Leveraging Hybrid Natural Language Feedback for Fine-tuning Language Models -->
+LIST:arXiv:2401.00907
+
+<!-- Evaluating Fairness in Self-supervised and Supervised Models for Sequential Data -->
+LIST:arXiv:2401.01640
+
+<!-- Advancing Ante Hoc Explainable Models through Generative Adversarial Networks -->
+LIST:arXiv:2401.04647
+
+<!-- H2G2-Net: A Hierarchical Heterogeneous Graph Generative Network Framework for Discovery of Multi-Modal Physiological Responses -->
+LIST:arXiv:2401.02905
+
+<!-- Improving Activation Steering in Language Models with Mean-Centring -->
+LIST:arXiv:2312.03813
+
+<!-- Learning Human-like Representations to Enable Learning Human Values -->
+LIST:arXiv:2312.14106
+
+<!-- Semi-Supervised Graph Representation Learning with Human-centric Explanation for Predicting Fatty Liver Disease -->
+LIST:arXiv:2403.02786
+
+<!-- Representation Learning for Wearable-Based Applications in the Case of Missing Data -->
+LIST:arXiv:2401.05437
+
+<!-- Balancing Continual Learning and Fine-tuning for Human Activity Recognition -->
+LIST:arXiv:2401.02255
+
+
+</body>
+</html>
diff --git a/tests/test_html.py b/tests/test_html.py
@@ -0,0 +1,11 @@
+def test_html_paper(client_with_test_fs):
+    """Check the abs page, a 404 for a missing file, and the html page of html-sourced paper 2403.10561."""
+    resp = client_with_test_fs.head("/abs/2403.10561")
+    assert resp.status_code == 200
+
+    resp = client_with_test_fs.get("/html/2403.10561/shouldnotexist.html")
+    assert resp.status_code == 404
+
+    resp = client_with_test_fs.get("/html/2403.10561")
+    assert resp.status_code == 200
+    assert "Human-Centric" in resp.data.decode()