
Commit 80149a1

Deduplicating media on Context creation (#1153)
1 parent c9bd88c commit 80149a1

File tree: 8 files changed (+183, −19 lines)

packages/paper-qa-docling/src/paperqa_docling/reader.py

Lines changed: 36 additions & 8 deletions
```diff
@@ -1,10 +1,11 @@
 import collections
 import io
+import json
 import os
 from collections.abc import Mapping
 from importlib.metadata import version
 from pathlib import Path
-from typing import Any
+from typing import Any, cast

 import docling
 from docling.datamodel.base_models import ConversionStatus
@@ -169,6 +170,19 @@ def parse_pdf_to_pages(  # noqa: PLR0912
                     f"Didn't yet handle 2+ picture description annotations {annotations}."
                 )

+            media_metadata["info_hashable"] = json.dumps(
+                {
+                    k: (
+                        v
+                        if k != "bbox"
+                        # Enables bbox deduplication based on whole pixels,
+                        # since <1-px differences are just noise
+                        else tuple(round(x) for x in cast(tuple, v))
+                    )
+                    for k, v in media_metadata.items()
+                },
+                sort_keys=True,
+            )
             content[str(page_num)][1].append(
                 ParsedMedia(
                     index=len(content[str(page_num)][1]),
@@ -193,18 +207,32 @@ def parse_pdf_to_pages(  # noqa: PLR0912
             table_image_data.save(img_bytes, format="PNG")
             img_bytes.seek(0)  # Reset pointer before read to avoid empty data

+            media_metadata = {
+                "type": "table",
+                "width": table_image_data.width,
+                "height": table_image_data.height,
+                "bbox": item.prov[0].bbox.as_tuple(),
+                "images_scale": pipeline_options.images_scale,
+            }
+            media_metadata["info_hashable"] = json.dumps(
+                {
+                    k: (
+                        v
+                        if k != "bbox"
+                        # Enables bbox deduplication based on whole pixels,
+                        # since <1-px differences are just noise
+                        else tuple(round(x) for x in cast(tuple, v))
+                    )
+                    for k, v in media_metadata.items()
+                },
+                sort_keys=True,
+            )
             content[str(page_num)][1].append(
                 ParsedMedia(
                     index=len(content[str(page_num)][1]),
                     data=img_bytes.read(),
                     text=item.export_to_markdown(doc),
-                    info={
-                        "type": "table",
-                        "width": table_image_data.width,
-                        "height": table_image_data.height,
-                        "bbox": item.prov[0].bbox.as_tuple(),
-                        "images_scale": pipeline_options.images_scale,
-                    },
+                    info=media_metadata,
                 )
             )
             count_media += 1
```
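
Both hunks build the same `info_hashable` value: the media metadata serialized to canonical JSON (`sort_keys=True`), with `bbox` coordinates rounded to whole pixels first so sub-pixel differences between otherwise-identical media don't defeat deduplication. A minimal standalone sketch of that canonicalization, with made-up metadata values:

```python
import json


def info_hashable(media_metadata: dict) -> str:
    """Canonical JSON key; bbox is rounded to whole pixels before serializing."""
    return json.dumps(
        {
            k: (v if k != "bbox" else tuple(round(x) for x in v))
            for k, v in media_metadata.items()
        },
        sort_keys=True,
    )


# Two bboxes differing by under a pixel yield the same key
a = {"type": "picture", "width": 400, "height": 200, "bbox": (10.2, 20.7, 410.1, 220.6)}
b = {"type": "picture", "width": 400, "height": 200, "bbox": (10.4, 20.9, 409.8, 220.7)}
assert info_hashable(a) == info_hashable(b)
```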

packages/paper-qa-docling/tests/test_paperqa_docling.py

Lines changed: 19 additions & 0 deletions
```diff
@@ -173,6 +173,25 @@ def test_page_range() -> None:
     assert "page_range=(1,20)" in parsed_text_p1_20.metadata.name


+def test_media_deduplication() -> None:
+    parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
+    assert isinstance(parsed_text.content, dict)
+    assert len(parsed_text.content) == 5, "Expected full PDF read"
+    all_media = [m for _, media in parsed_text.content.values() for m in media]  # type: ignore[misc]
+
+    all_images = [m for m in all_media if m.info.get("type") == "picture"]
+    assert len(all_images) == 5, "Expected each image to be read"
+    assert (
+        len(set(all_images)) <= 2
+    ), "Expected images on all pages beyond 1 to be deduplicated"
+
+    all_tables = [m for m in all_media if m.info.get("type") == "table"]
+    assert len(all_tables) == 5, "Expected each table to be read"
+    assert (
+        len(set(all_tables)) <= 2
+    ), "Expected tables on all pages beyond 1 to be deduplicated"
+
+
 def test_page_size_limit_denial() -> None:
     with pytest.raises(ImpossibleParsingError, match="char limit"):
         parse_pdf_to_pages(STUB_DATA_DIR / "paper.pdf", page_size_limit=10)  # chars
```

src/paperqa/core.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -231,7 +231,8 @@ async def _map_fxn_summary(  # noqa: PLR0912
     # but not spaces, to preserve text alignment
     cleaned_text = text.text.strip("\n")
     if summary_llm_model and prompt_templates:
-        media_text: list[str] = [m.text for m in text.media if m.text]
+        unique_media = list(dict.fromkeys(text.media))  # Preserve order
+        media_text: list[str] = [m.text for m in unique_media if m.text]
         data = {
             "question": question,
             "citation": citation,
@@ -254,8 +255,8 @@ async def _map_fxn_summary(  # noqa: PLR0912
             Message.create_message(
                 text=message_prompt,
                 images=(
-                    [i.to_image_url() for i in text.media]
-                    if text.media
+                    [i.to_image_url() for i in unique_media]
+                    if unique_media
                     else None
                 ),
             ),
```
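
`dict.fromkeys` is the usual order-preserving deduplication idiom: dict keys are unique and insertion-ordered, so the first occurrence of each media item survives in position. It depends on `ParsedMedia` hashing duplicates identically, which the `types.py` change below provides. A quick illustration with strings standing in for media objects:

```python
# First occurrence of each key wins; order of first appearance is kept
media = ["fig-p1", "fig-p2", "fig-p1", "table-p1", "fig-p2"]
assert list(dict.fromkeys(media)) == ["fig-p1", "fig-p2", "table-p1"]
```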

src/paperqa/types.py

Lines changed: 9 additions & 6 deletions
```diff
@@ -8,7 +8,7 @@
 import os
 import re
 import warnings
-from collections.abc import Collection, Iterable, Mapping, Sequence
+from collections.abc import Collection, Hashable, Iterable, Mapping, Sequence
 from copy import deepcopy
 from datetime import UTC, datetime
 from enum import StrEnum
@@ -516,10 +516,14 @@ class ParsedMedia(BaseModel):
         ),
     )

+    def _get_info_hashable(self) -> Hashable:
+        if info_hashable := self.info.get("info_hashable"):
+            return cast(Hashable, info_hashable)
+        # We know info_hashable key isn't present, so no need to filter it
+        return json.dumps(self.info, sort_keys=True)
+
     def __hash__(self) -> int:
-        return hash(
-            (self.index, self.data, self.text, json.dumps(self.info, sort_keys=True))
-        )
+        return hash((self.index, self.data, self.text, self._get_info_hashable()))

     def to_id(self) -> UUID:
         """Convert this media to a UUID4 suitable for a database ID."""
@@ -547,8 +551,7 @@ def __eq__(self, other) -> bool:
             self.index == other.index
             and self.data == other.data
             and self.text == other.text
-            and json.dumps(self.info, sort_keys=True)
-            == json.dumps(other.info, sort_keys=True)
+            and self._get_info_hashable() == other._get_info_hashable()
         )

     def to_image_url(self) -> str:
```
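
Equality and hashing now prefer the parser-supplied `info_hashable` string and fall back to canonical JSON of the whole `info` dict. A rough standalone illustration using a stripped-down stand-in class (hypothetical, not the real Pydantic `ParsedMedia`):

```python
import json
from collections.abc import Hashable


class MediaStub:
    """Hypothetical stand-in mirroring ParsedMedia's hash/eq logic."""

    def __init__(self, index: int, data: bytes, text: str, info: dict):
        self.index, self.data, self.text, self.info = index, data, text, info

    def _get_info_hashable(self) -> Hashable:
        # Prefer the parser's precomputed key, else canonicalize all of info
        if info_hashable := self.info.get("info_hashable"):
            return info_hashable
        return json.dumps(self.info, sort_keys=True)

    def __hash__(self) -> int:
        return hash((self.index, self.data, self.text, self._get_info_hashable()))

    def __eq__(self, other) -> bool:
        return (
            self.index == other.index
            and self.data == other.data
            and self.text == other.text
            and self._get_info_hashable() == other._get_info_hashable()
        )


# Identical payloads with matching precomputed keys collapse in a set,
# which is what list(dict.fromkeys(...)) in core.py relies on
m1 = MediaStub(0, b"png-bytes", "caption", {"info_hashable": '{"bbox": [10, 21]}'})
m2 = MediaStub(0, b"png-bytes", "caption", {"info_hashable": '{"bbox": [10, 21]}'})
assert len({m1, m2}) == 1
```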

tests/duplicate_media_template.md

Lines changed: 71 additions & 0 deletions
```diff
@@ -0,0 +1,71 @@
+# SF Districts in the style of Andy Warhol
+
+<!-- pyml disable-num-lines 5 line-length -->
+
+[//]: # "To generate `stub_data/duplicate_media.pdf` from this:"
+[//]: # "1. `pandoc duplicate_media_template.md --standalone --self-contained -t html -o temp.html`"
+[//]: # "2. `Chromium --headless --disable-gpu --print-to-pdf=stub_data/duplicate_media.pdf --no-pdf-header-footer temp.html`"
+[//]: # "3. `rm temp.html`"
+
+<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
+
+Text under image 1.
+
+| Col1  | Col2  |
+| ----- | ----- |
+| Val11 | Val12 |
+| Val21 | Val11 |
+
+Text under table 1.
+
+<div style="page-break-after: always;"></div>
+
+<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
+
+Text under image 2.
+
+| Col1  | Col2  |
+| ----- | ----- |
+| Val11 | Val12 |
+| Val21 | Val11 |
+
+Text under table 2.
+
+<div style="page-break-after: always;"></div>
+
+<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
+
+Text under image 3.
+
+| Col1  | Col2  |
+| ----- | ----- |
+| Val11 | Val12 |
+| Val21 | Val11 |
+
+Text under table 3.
+
+<div style="page-break-after: always;"></div>
+
+<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
+
+Text under image 4.
+
+| Col1  | Col2  |
+| ----- | ----- |
+| Val11 | Val12 |
+| Val21 | Val11 |
+
+Text under table 4.
+
+<div style="page-break-after: always;"></div>
+
+<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>
+
+Text under image 5.
+
+| Col1  | Col2  |
+| ----- | ----- |
+| Val11 | Val12 |
+| Val21 | Val11 |
+
+Text under table 5.
```
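
Each of the template's five pages embeds the same `sf_districts.png` image and the same two-column table, with the `page-break-after` divs forcing one image/table pair per PDF page, so the deduplication tests get five near-identical copies of each media item to collapse.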
tests/stub_data/duplicate_media.pdf

168 KB; binary file not shown.

tests/test_agents.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -90,11 +90,12 @@ async def test_get_directory_index(
         "year",
     ], "Incorrect fields in index"
     assert not index.changed, "Expected index to not have changes at this point"
-    # bates.txt + empty.txt + flag_day.html + gravity_hill.md + influence.pdf + obama.txt + paper.pdf + pasa.pdf,
+    # bates.txt + empty.txt + flag_day.html + gravity_hill.md + influence.pdf
+    # + obama.txt + paper.pdf + pasa.pdf + duplicate_media.pdf,
     # but empty.txt fails to be added
     path_to_id = await index.index_files
     assert (
-        sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 7
+        sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 8
     ), "Incorrect number of parsed index files"

     with subtests.test(msg="check-txt-query"):
@@ -252,6 +253,7 @@ async def test_getting_manifest(

 EXPECTED_STUB_DATA_FILES = {
     "bates.txt",
+    "duplicate_media.pdf",
     "empty.txt",
     "flag_day.html",
     "gravity_hill.md",
```

tests/test_paperqa.py

Lines changed: 40 additions & 0 deletions
```diff
@@ -2,6 +2,7 @@
 import contextlib
 import csv
 import io
+import json
 import os
 import pathlib
 import pickle
@@ -35,6 +36,7 @@
 )
 from lmi.llms import rate_limited
 from lmi.utils import VCR_DEFAULT_MATCH_ON, validate_image
+from paperqa_docling import parse_pdf_to_pages as docling_parse_pdf_to_pages
 from paperqa_pymupdf import parse_pdf_to_pages as pymupdf_parse_pdf_to_pages
 from paperqa_pypdf import parse_pdf_to_pages as pypdf_parse_pdf_to_pages
 from pytest_subtests import SubTests
@@ -1693,6 +1695,44 @@ async def test_images(stub_data_dir: Path) -> None:
     assert all(bool(c.used_images) for c in contexts_used)  # type: ignore[attr-defined]


+@pytest.mark.asyncio
+async def test_duplicate_media_context_creation(stub_data_dir: Path) -> None:
+    settings = Settings(
+        prompts={"summary_json_system": summary_json_multimodal_system_prompt},
+        parsing={"parse_pdf": docling_parse_pdf_to_pages},
+    )
+
+    docs = Docs()
+    assert await docs.aadd(
+        stub_data_dir / "duplicate_media.pdf",
+        citation="FutureHouse, 2025, Accessed now",  # Skip citation inference
+        title="SF Districts in the style of Andy Warhol",  # Skip title inference
+        settings=settings,
+    )
+    with patch.object(
+        LLMModel, "call_single", side_effect=LLMModel.call_single, autospec=True
+    ) as mock_call_single:
+        session = await docs.aquery(
+            "What districts neighbor the Western Addition?", settings=settings
+        )
+    context_user_msg = mock_call_single.await_args_list[0][1]["messages"][1]
+    assert isinstance(context_user_msg, Message)
+    assert context_user_msg.content
+    content_list = json.loads(context_user_msg.content)
+    assert isinstance(content_list, list)
+    assert (
+        sum("image_url" in x for x in content_list) < 5
+    ), "Expected some deduplication to take place during context creation"
+    assert (
+        sum(
+            district in session.answer
+            for district in ("The Avenues", "Golden Gate", "Civic Center", "Haight")
+        )
+        >= 2
+    ), "Expected at least two neighbors to be matched"
+    assert session.cost > 0
+
+
 @pytest.mark.asyncio
 async def test_images_corrupt(stub_data_dir: Path, caplog) -> None:
     settings = Settings.from_name("fast")
```
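
The `patch.object(..., side_effect=LLMModel.call_single, autospec=True)` construction is a spy rather than a stub: the real method still runs while the mock records every call. A generic sketch of the pattern with a hypothetical class, not paper-qa's API:

```python
from unittest.mock import patch


class Greeter:
    """Hypothetical class standing in for LLMModel."""

    def greet(self, name: str) -> str:
        return f"hello {name}"


with patch.object(Greeter, "greet", side_effect=Greeter.greet, autospec=True) as spy:
    assert Greeter().greet("world") == "hello world"  # Real method still ran

# autospec forwards `self`, so the recorded args are (instance, "world")
assert spy.call_args_list[0].args[1] == "world"
```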
