|  | 
| 2 | 2 | import contextlib | 
| 3 | 3 | import csv | 
| 4 | 4 | import io | 
|  | 5 | +import json | 
| 5 | 6 | import os | 
| 6 | 7 | import pathlib | 
| 7 | 8 | import pickle | 
|  | 
| 35 | 36 | ) | 
| 36 | 37 | from lmi.llms import rate_limited | 
| 37 | 38 | from lmi.utils import VCR_DEFAULT_MATCH_ON, validate_image | 
|  | 39 | +from paperqa_docling import parse_pdf_to_pages as docling_parse_pdf_to_pages | 
| 38 | 40 | from paperqa_pymupdf import parse_pdf_to_pages as pymupdf_parse_pdf_to_pages | 
| 39 | 41 | from paperqa_pypdf import parse_pdf_to_pages as pypdf_parse_pdf_to_pages | 
| 40 | 42 | from pytest_subtests import SubTests | 
| @@ -1693,6 +1695,44 @@ async def test_images(stub_data_dir: Path) -> None: | 
| 1693 | 1695 |     assert all(bool(c.used_images) for c in contexts_used)  # type: ignore[attr-defined] | 
| 1694 | 1696 | 
 | 
| 1695 | 1697 | 
 | 
@pytest.mark.asyncio
async def test_duplicate_media_context_creation(stub_data_dir: Path) -> None:
    """Check that media is deduplicated in the context-creation prompt.

    Adds ``duplicate_media.pdf`` using the Docling PDF parser, runs a query while
    spying on ``LLMModel.call_single``, and then inspects the second message of
    the first awaited call: its JSON content must hold fewer than five
    ``image_url`` entries. The answer must also mention at least two of the
    expected districts, and the session must report a non-zero cost.
    """
    settings = Settings(
        prompts={"summary_json_system": summary_json_multimodal_system_prompt},
        parsing={"parse_pdf": docling_parse_pdf_to_pages},
    )
    pdf_path = stub_data_dir / "duplicate_media.pdf"

    docs = Docs()
    added = await docs.aadd(
        pdf_path,
        citation="FutureHouse, 2025, Accessed now",  # Skip citation inference
        title="SF Districts in the style of Andy Warhol",  # Skip title inference
        settings=settings,
    )
    assert added

    # Spy on the real method: autospec keeps the signature, and the side effect
    # forwards to the unpatched implementation captured before patching.
    original_call_single = LLMModel.call_single
    with patch.object(
        LLMModel, "call_single", side_effect=original_call_single, autospec=True
    ) as mock_call_single:
        session = await docs.aquery(
            "What districts neighbor the Western Addition?", settings=settings
        )

    # NOTE(review): presumably the first awaited call is the context-creation
    # call, with messages[1] being its user message -- confirm against aquery.
    first_await = mock_call_single.await_args_list[0]
    context_user_msg = first_await.kwargs["messages"][1]
    assert isinstance(context_user_msg, Message)
    assert context_user_msg.content

    content_list = json.loads(context_user_msg.content)
    assert isinstance(content_list, list)

    image_entry_count = sum(1 for entry in content_list if "image_url" in entry)
    assert (
        image_entry_count < 5
    ), "Expected some deduplication to take place during context creation"

    candidate_districts = ("The Avenues", "Golden Gate", "Civic Center", "Haight")
    matched_districts = [
        district for district in candidate_districts if district in session.answer
    ]
    assert len(matched_districts) >= 2, "Expected at least two neighbors to be matched"

    assert session.cost > 0
|  | 1734 | + | 
|  | 1735 | + | 
| 1696 | 1736 | @pytest.mark.asyncio | 
| 1697 | 1737 | async def test_images_corrupt(stub_data_dir: Path, caplog) -> None: | 
| 1698 | 1738 |     settings = Settings.from_name("fast") | 
|  | 
0 commit comments