44 changes: 36 additions & 8 deletions packages/paper-qa-docling/src/paperqa_docling/reader.py
@@ -1,10 +1,11 @@
import collections
import io
import json
import os
from collections.abc import Mapping
from importlib.metadata import version
from pathlib import Path
from typing import Any
from typing import Any, cast

import docling
from docling.datamodel.base_models import ConversionStatus
@@ -169,6 +170,19 @@ def parse_pdf_to_pages( # noqa: PLR0912
f"Didn't yet handle 2+ picture description annotations {annotations}."
)

media_metadata["info_hashable"] = json.dumps(
{
k: (
v
if k != "bbox"
# Enables bbox deduplication based on whole pixels,
# since <1-px differences are just noise
else tuple(round(x) for x in cast(tuple, v))
)
for k, v in media_metadata.items()
},
sort_keys=True,
)
content[str(page_num)][1].append(
ParsedMedia(
index=len(content[str(page_num)][1]),
@@ -193,18 +207,32 @@ def parse_pdf_to_pages( # noqa: PLR0912
table_image_data.save(img_bytes, format="PNG")
img_bytes.seek(0) # Reset pointer before read to avoid empty data

media_metadata = {
"type": "table",
"width": table_image_data.width,
"height": table_image_data.height,
"bbox": item.prov[0].bbox.as_tuple(),
"images_scale": pipeline_options.images_scale,
}
media_metadata["info_hashable"] = json.dumps(
{
k: (
v
if k != "bbox"
# Enables bbox deduplication based on whole pixels,
# since <1-px differences are just noise
else tuple(round(x) for x in cast(tuple, v))
)
for k, v in media_metadata.items()
},
sort_keys=True,
)
content[str(page_num)][1].append(
ParsedMedia(
index=len(content[str(page_num)][1]),
data=img_bytes.read(),
text=item.export_to_markdown(doc),
info={
"type": "table",
"width": table_image_data.width,
"height": table_image_data.height,
"bbox": item.prov[0].bbox.as_tuple(),
"images_scale": pipeline_options.images_scale,
},
info=media_metadata,
)
)
count_media += 1
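Why round the bbox before serializing? Sub-pixel differences in the coordinates the reader reports would otherwise make visually identical media serialize to different keys. A minimal standalone sketch of the idea, with invented coordinate values:

import json

def info_key(media_metadata: dict) -> str:
    """Serialize media metadata, rounding bbox coordinates to whole pixels."""
    return json.dumps(
        {
            k: (tuple(round(x) for x in v) if k == "bbox" else v)
            for k, v in media_metadata.items()
        },
        sort_keys=True,
    )

a = {"type": "picture", "width": 414, "height": 200, "bbox": (72.0, 100.2, 486.0, 300.4)}
b = {"type": "picture", "width": 414, "height": 200, "bbox": (72.3, 100.0, 485.9, 300.1)}
assert info_key(a) == info_key(b)  # sub-pixel jitter collapses to one key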
19 changes: 19 additions & 0 deletions packages/paper-qa-docling/tests/test_paperqa_docling.py
@@ -173,6 +173,25 @@ def test_page_range() -> None:
assert "page_range=(1,20)" in parsed_text_p1_20.metadata.name


def test_media_deduplication() -> None:
parsed_text = parse_pdf_to_pages(STUB_DATA_DIR / "duplicate_media.pdf")
assert isinstance(parsed_text.content, dict)
assert len(parsed_text.content) == 5, "Expected full PDF read"
all_media = [m for _, media in parsed_text.content.values() for m in media] # type: ignore[misc]

all_images = [m for m in all_media if m.info.get("type") == "picture"]
assert len(all_images) == 5, "Expected each image to be read"
assert (
len(set(all_images)) <= 2
), "Expected images on all pages beyond 1 to be deduplicated"

all_tables = [m for m in all_media if m.info.get("type") == "table"]
assert len(all_tables) == 5, "Expected each table to be read"
assert (
len(set(all_tables)) <= 2
), "Expected tables on all pages beyond 1 to be deduplicated"


def test_page_size_limit_denial() -> None:
with pytest.raises(ImpossibleParsingError, match="char limit"):
parse_pdf_to_pages(STUB_DATA_DIR / "paper.pdf", page_size_limit=10) # chars
7 changes: 4 additions & 3 deletions src/paperqa/core.py
@@ -231,7 +231,8 @@ async def _map_fxn_summary( # noqa: PLR0912
# but not spaces, to preserve text alignment
cleaned_text = text.text.strip("\n")
if summary_llm_model and prompt_templates:
media_text: list[str] = [m.text for m in text.media if m.text]
unique_media = list(dict.fromkeys(text.media)) # Preserve order
media_text: list[str] = [m.text for m in unique_media if m.text]
data = {
"question": question,
"citation": citation,
@@ -254,8 +255,8 @@
Message.create_message(
text=message_prompt,
images=(
[i.to_image_url() for i in text.media]
if text.media
[i.to_image_url() for i in unique_media]
if unique_media
else None
),
),
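dict.fromkeys deduplicates while preserving first-seen order, unlike set, and it keys on the __hash__/__eq__ pair that ParsedMedia defines (see src/paperqa/types.py below). A toy illustration:

media = ["img-a", "img-b", "img-a", "img-c", "img-b"]
unique_media = list(dict.fromkeys(media))  # dict keys preserve insertion order
assert unique_media == ["img-a", "img-b", "img-c"]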
15 changes: 9 additions & 6 deletions src/paperqa/types.py
@@ -8,7 +8,7 @@
import os
import re
import warnings
from collections.abc import Collection, Iterable, Mapping, Sequence
from collections.abc import Collection, Hashable, Iterable, Mapping, Sequence
from copy import deepcopy
from datetime import UTC, datetime
from enum import StrEnum
@@ -516,10 +516,14 @@ class ParsedMedia(BaseModel):
),
)

def _get_info_hashable(self) -> Hashable:
if info_hashable := self.info.get("info_hashable"):
return cast(Hashable, info_hashable)
# We know the info_hashable key isn't present, so no need to filter it
return json.dumps(self.info, sort_keys=True)

def __hash__(self) -> int:
return hash(
(self.index, self.data, self.text, json.dumps(self.info, sort_keys=True))
)
return hash((self.index, self.data, self.text, self._get_info_hashable()))

def to_id(self) -> UUID:
"""Convert this media to a UUID4 suitable for a database ID."""
@@ -547,8 +551,7 @@ def __eq__(self, other) -> bool:
self.index == other.index
and self.data == other.data
and self.text == other.text
and json.dumps(self.info, sort_keys=True)
== json.dumps(other.info, sort_keys=True)
and self._get_info_hashable() == other._get_info_hashable()
)

def to_image_url(self) -> str:
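Both deduplication sites above rely on Python's contract that objects comparing equal must hash equal. A condensed standalone sketch mirroring ParsedMedia's fields (MediaStub is illustrative, not the library class):

import json

class MediaStub:
    def __init__(self, index: int, data: bytes, text: str | None = None, info: dict | None = None):
        self.index, self.data, self.text, self.info = index, data, text, info or {}

    def _get_info_hashable(self) -> str:
        # Prefer the precomputed key (bbox already rounded by the reader);
        # otherwise serialize the raw info dict deterministically.
        return self.info.get("info_hashable") or json.dumps(self.info, sort_keys=True)

    def __hash__(self) -> int:
        return hash((self.index, self.data, self.text, self._get_info_hashable()))

    def __eq__(self, other) -> bool:
        if not isinstance(other, MediaStub):
            return NotImplemented
        return (
            self.index == other.index
            and self.data == other.data
            and self.text == other.text
            and self._get_info_hashable() == other._get_info_hashable()
        )

key = json.dumps({"bbox": [72, 100, 486, 300], "type": "picture"}, sort_keys=True)
a = MediaStub(0, b"png", info={"info_hashable": key})
b = MediaStub(0, b"png", info={"bbox": (72.3, 100.1, 486.2, 299.8), "info_hashable": key})
assert a == b and hash(a) == hash(b)  # so set() and dict.fromkeys() can deduplicate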
71 changes: 71 additions & 0 deletions tests/duplicate_media_template.md
@@ -0,0 +1,71 @@
# SF Districts in the style of Andy Warhol

<!-- pyml disable-num-lines 5 line-length -->

[//]: # "To generate `stub_data/duplicate_media.pdf` from this:"
[//]: # "1. `pandoc duplicate_media_template.md --standalone --self-contained -t html -o temp.html`"
[//]: # "2. `Chromium --headless --disable-gpu --print-to-pdf=stub_data/duplicate_media.pdf --no-pdf-header-footer temp.html`"
[//]: # "3. `rm temp.html`"

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 1.

| Col1 | Col2 |
| ----- | ----- |
| Val11 | Val12 |
| Val21 | Val11 |

Text under table 1.

<div style="page-break-after: always;"></div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 2.

| Col1 | Col2 |
| ----- | ----- |
| Val11 | Val12 |
| Val21 | Val11 |

Text under table 2.

<div style="page-break-after: always;"></div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 3.

| Col1 | Col2 |
| ----- | ----- |
| Val11 | Val12 |
| Val21 | Val11 |

Text under table 3.

<div style="page-break-after: always;"></div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 4.

| Col1 | Col2 |
| ----- | ----- |
| Val11 | Val12 |
| Val21 | Val11 |

Text under table 4.

<div style="page-break-after: always;"></div>

<img src="stub_data/sf_districts.png" alt="Map of SF districts" height="200"/>

Text under image 5.

| Col1 | Col2 |
| ----- | ----- |
| Val11 | Val12 |
| Val21 | Val11 |

Text under table 5.
Binary file added tests/stub_data/duplicate_media.pdf
Binary file not shown.
6 changes: 4 additions & 2 deletions tests/test_agents.py
@@ -90,11 +90,12 @@ async def test_get_directory_index(
"year",
], "Incorrect fields in index"
assert not index.changed, "Expected index to not have changes at this point"
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + influence.pdf + obama.txt + paper.pdf + pasa.pdf,
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + influence.pdf
# + obama.txt + paper.pdf + pasa.pdf + duplicate_media.pdf,
# but empty.txt fails to be added
path_to_id = await index.index_files
assert (
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 7
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 8
), "Incorrect number of parsed index files"

with subtests.test(msg="check-txt-query"):
@@ -252,6 +253,7 @@ async def test_getting_manifest(

EXPECTED_STUB_DATA_FILES = {
"bates.txt",
"duplicate_media.pdf",
"empty.txt",
"flag_day.html",
"gravity_hill.md",
40 changes: 40 additions & 0 deletions tests/test_paperqa.py
@@ -2,6 +2,7 @@
import contextlib
import csv
import io
import json
import os
import pathlib
import pickle
@@ -35,6 +36,7 @@
)
from lmi.llms import rate_limited
from lmi.utils import VCR_DEFAULT_MATCH_ON, validate_image
from paperqa_docling import parse_pdf_to_pages as docling_parse_pdf_to_pages
from paperqa_pymupdf import parse_pdf_to_pages as pymupdf_parse_pdf_to_pages
from paperqa_pypdf import parse_pdf_to_pages as pypdf_parse_pdf_to_pages
from pytest_subtests import SubTests
@@ -1693,6 +1695,44 @@ async def test_images(stub_data_dir: Path) -> None:
assert all(bool(c.used_images) for c in contexts_used) # type: ignore[attr-defined]


@pytest.mark.asyncio
async def test_duplicate_media_context_creation(stub_data_dir: Path) -> None:
settings = Settings(
prompts={"summary_json_system": summary_json_multimodal_system_prompt},
parsing={"parse_pdf": docling_parse_pdf_to_pages},
)

docs = Docs()
assert await docs.aadd(
stub_data_dir / "duplicate_media.pdf",
citation="FutureHouse, 2025, Accessed now", # Skip citation inference
title="SF Districts in the style of Andy Warhol", # Skip title inference
settings=settings,
)
with patch.object(
LLMModel, "call_single", side_effect=LLMModel.call_single, autospec=True
) as mock_call_single:
session = await docs.aquery(
"What districts neighbor the Western Addition?", settings=settings
)
context_user_msg = mock_call_single.await_args_list[0][1]["messages"][1]
assert isinstance(context_user_msg, Message)
assert context_user_msg.content
content_list = json.loads(context_user_msg.content)
assert isinstance(content_list, list)
assert (
sum("image_url" in x for x in content_list) < 5
), "Expected some deduplication to take place during context creation"
assert (
sum(
district in session.answer
for district in ("The Avenues", "Golden Gate", "Civic Center", "Haight")
)
>= 2
), "Expected at least two neighbors to be matched"
assert session.cost > 0
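
The patch.object(..., side_effect=LLMModel.call_single, autospec=True) idiom above is a spy: the real method still runs, while await_args_list records each awaited call (with autospec, self is captured as the first positional argument). A minimal sketch of the same pattern with illustrative names:

import asyncio
from unittest.mock import patch

class Greeter:
    async def greet(self, name: str) -> str:
        return f"hello {name}"

async def main() -> None:
    # side_effect=Greeter.greet keeps real behavior; the autospec mock records calls
    with patch.object(Greeter, "greet", side_effect=Greeter.greet, autospec=True) as spy:
        result = await Greeter().greet("docling")
    assert result == "hello docling"
    assert spy.await_args_list[0].args[1] == "docling"  # args[0] is self

asyncio.run(main())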


@pytest.mark.asyncio
async def test_images_corrupt(stub_data_dir: Path, caplog) -> None:
settings = Settings.from_name("fast")