Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _make_chunk(
) -> Text:
media: list[ParsedMedia] = []
for pg_num in range(int(lower_page), int(upper_page) + 1):
pg_contents = cast(dict, parsed_text.content)[str(pg_num)]
pg_contents = cast(dict, parsed_text.content).get(str(pg_num))
if isinstance(pg_contents, tuple):
media.extend(pg_contents[1])
# pretty formatting of pages (e.g. 1-3, 4, 5-7)
Expand Down
26 changes: 24 additions & 2 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,15 @@
)
from paperqa.prompts import CANNOT_ANSWER_PHRASE, summary_json_multimodal_system_prompt
from paperqa.prompts import qa_prompt as default_qa_prompt
from paperqa.readers import PDFParserFn, parse_image, read_doc
from paperqa.readers import PDFParserFn, chunk_pdf, parse_image, read_doc
from paperqa.settings import AsyncContextSerializer
from paperqa.types import ChunkMetadata, Context, ParsedMedia
from paperqa.types import (
ChunkMetadata,
Context,
ParsedMedia,
ParsedMetadata,
ParsedText,
)
from paperqa.utils import (
bytes_to_string,
clean_possessives,
Expand Down Expand Up @@ -1686,6 +1692,22 @@ async def test_images_corrupt(stub_data_dir: Path, caplog) -> None:
assert session.cost > 0, "Expected some costs to have been incurred in our attempt"


def test_missing_page_doesnt_crash_us() -> None:
    """Check that chunking copes with a page number absent from the parsed content."""
    pages = {
        "1": "A",
        # Page 2 was totally blank
        "3": "C",
    }
    parsed_metadata = ParsedMetadata(
        parsing_libraries=["stub"], total_parsed_text_length=2
    )
    stub_parsed_text = ParsedText(content=pages, metadata=parsed_metadata)
    stub_doc = Doc(docname="stub", citation="stub", dockey="stub")

    chunks = chunk_pdf(stub_parsed_text, stub_doc, chunk_chars=100, overlap=5)

    # Single-element unpacking doubles as a check that exactly one chunk came back
    (text,) = chunks
    assert text.doc == stub_doc
    # The chunk name is expected to span the page range, gap included
    assert "1-3" in text.name
    assert text.text == "AC"


def test_zotero() -> None:
from paperqa.contrib import ZoteroDB

Expand Down