Generalizing tests for smarter LLMs 2 (#1154)

jamesbraza · web-flow · commit c9bd88cbc414 · 2025-10-27T12:43:03.000-07:00
diff --git a/packages/paper-qa-docling/tests/test_paperqa_docling.py b/packages/paper-qa-docling/tests/test_paperqa_docling.py
@@ -10,7 +10,7 @@
 from lmi.utils import bytes_to_string
 from paperqa import Doc, Docs
 from paperqa.readers import PDFParserFn, chunk_pdf
-from paperqa.utils import ImpossibleParsingError
+from paperqa.utils import ImpossibleParsingError, get_citation_ids
 
 from paperqa_docling import parse_pdf_to_pages
 
@@ -87,8 +87,8 @@ async def test_parse_pdf_to_pages() -> None:
         ("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
         ("What actions can the Selector take?", [(("select", "drop"), 2)]),
         (
-            "How many User Query are there, and what do they do?",
-            [(("two", "2"), 2), (("crawler", "selector"), 2)],
+            "How many User Query blue boxes are there, and what are they connected to?",
+            [(("two", "2"), 1), (("crawler", "selector"), 2)],
         ),
     ):
         session = await docs.aquery(query=query)
@@ -97,12 +97,17 @@ async def test_parse_pdf_to_pages() -> None:
             c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
             for c in session.contexts
         ), "Expected context to reuse Figure 1's text and media"
+        # Remove citations so numeric assertions don't have false positives
+        raw_answer_no_citations = session.raw_answer
+        for key in get_citation_ids(session.raw_answer):
+            raw_answer_no_citations = raw_answer_no_citations.replace(f"({key})", "")
         for substrings, min_count in cast(
             list[tuple[tuple[str, ...], int]], substrings_min_counts
         ):
             assert (
-                sum(x in session.answer.lower() for x in substrings) >= min_count
-            ), f"Expected {session.answer=} to have at {substrings} present"
+                sum(x in raw_answer_no_citations.lower() for x in substrings)
+                >= min_count
+            ), f"Expected {raw_answer_no_citations=} to have {substrings} present"
 
     # Check the no-media behavior
     parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
diff --git a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
@@ -9,7 +9,7 @@
 from lmi.utils import bytes_to_string
 from paperqa import Doc, Docs, Settings
 from paperqa.readers import PDFParserFn, chunk_pdf
-from paperqa.utils import ImpossibleParsingError
+from paperqa.utils import ImpossibleParsingError, get_citation_ids
 
 from paperqa_pymupdf import parse_pdf_to_pages
 
@@ -82,8 +82,8 @@ async def test_parse_pdf_to_pages() -> None:
         ("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
         ("What actions can the Selector take?", [(("select", "drop"), 2)]),
         (
-            "How many User Query are there, and what do they do?",
-            [(("two", "2"), 2), (("crawler", "selector"), 2)],
+            "How many User Query blue boxes are there, and what are they connected to?",
+            [(("two", "2"), 1), (("crawler", "selector"), 2)],
         ),
     ):
         session = await docs.aquery(query=query)
@@ -92,12 +92,17 @@ async def test_parse_pdf_to_pages() -> None:
             c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
             for c in session.contexts
         ), "Expected context to reuse Figure 1's text and media"
+        # Remove citations so numeric assertions don't have false positives
+        raw_answer_no_citations = session.raw_answer
+        for key in get_citation_ids(session.raw_answer):
+            raw_answer_no_citations = raw_answer_no_citations.replace(f"({key})", "")
         for substrings, min_count in cast(
             list[tuple[tuple[str, ...], int]], substrings_min_counts
         ):
             assert (
-                sum(x in session.answer.lower() for x in substrings) >= min_count
-            ), f"Expected {session.answer=} to have at {substrings} present"
+                sum(x in raw_answer_no_citations.lower() for x in substrings)
+                >= min_count
+            ), f"Expected {raw_answer_no_citations=} to have {substrings} present"
 
     # Let's check the full page parsing behavior
     parsed_text_full_page = parse_pdf_to_pages(filepath, full_page=True)
diff --git a/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py b/packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
@@ -9,7 +9,7 @@
 from lmi.utils import bytes_to_string
 from paperqa import Doc, Docs
 from paperqa.readers import PDFParserFn, chunk_pdf
-from paperqa.utils import ImpossibleParsingError
+from paperqa.utils import ImpossibleParsingError, get_citation_ids
 
 from paperqa_pypdf import parse_pdf_to_pages
 
@@ -85,8 +85,8 @@ async def test_parse_pdf_to_pages() -> None:
         ("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
         ("What actions can the Selector take?", [(("select", "drop"), 2)]),
         (
-            "How many User Query are there, and what do they do?",
-            [(("two", "2"), 2), (("crawler", "selector"), 2)],
+            "How many User Query blue boxes are there, and what are they connected to?",
+            [(("two", "2"), 1), (("crawler", "selector"), 2)],
         ),
     ):
         session = await docs.aquery(query=query)
@@ -95,12 +95,17 @@ async def test_parse_pdf_to_pages() -> None:
             c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
             for c in session.contexts
         ), "Expected context to reuse Figure 1's text and media"
+        # Remove citations so numeric assertions don't have false positives
+        raw_answer_no_citations = session.raw_answer
+        for key in get_citation_ids(session.raw_answer):
+            raw_answer_no_citations = raw_answer_no_citations.replace(f"({key})", "")
         for substrings, min_count in cast(
             list[tuple[tuple[str, ...], int]], substrings_min_counts
         ):
             assert (
-                sum(x in session.answer.lower() for x in substrings) >= min_count
-            ), f"Expected {session.answer=} to have at {substrings} present"
+                sum(x in raw_answer_no_citations.lower() for x in substrings)
+                >= min_count
+            ), f"Expected {raw_answer_no_citations=} to have {substrings} present"
 
     # Check the no-media behavior
     parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -1732,8 +1732,12 @@ async def test_images_corrupt(stub_data_dir: Path, caplog) -> None:
         "What districts neighbor the Western Addition?", settings=settings
     )
     assert not session.contexts, "Expected no contexts to be made from a bad image."
-    assert (
-        "unsupported image" in caplog.text
+    assert any(
+        x in caplog.text.lower()
+        for x in (
+            "unsupported image",  # OpenAI
+            "could not process image",  # Anthropic
+        )
     ), "Expected a caught exception about an unsupported image."
 
     # By suppressing the use of images, we can actually gather evidence now