1 change: 1 addition & 0 deletions README.md
@@ -880,6 +880,7 @@ will return much faster than the first query and we'll be certain the authors match
| `answer.evidence_retrieval` | `True` | Use retrieval vs processing all docs. |
| `answer.evidence_summary_length` | `"about 100 words"` | Length of evidence summary. |
| `answer.evidence_skip_summary` | `False` | Whether to skip summarization. |
| `answer.evidence_text_only_fallback` | `False` | Whether to allow context creation to retry without media present. |
| `answer.answer_max_sources` | `5` | Max number of sources for an answer. |
| `answer.max_answer_attempts` | `None` | Max attempts to generate an answer. |
| `answer.answer_length` | `"about 200 words, but can be longer"` | Length of final answer. |
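For context, the new flag is toggled through a `Settings` object. A minimal, untested sketch follows; the `Settings` import and the nested `answer` config are assumed from the existing paper-qa API, and only the flag name comes from this diff:

```python
from paperqa import Settings

# Sketch: opt into retrying context creation without media when the
# multimodal LLM call is rejected by the provider.
settings = Settings()
settings.answer.evidence_text_only_fallback = True
```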
1 change: 1 addition & 0 deletions packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
@@ -19,6 +19,7 @@ def test_parse_pdf_to_pages() -> None:
parsed_text = parse_pdf_to_pages(filepath)
assert isinstance(parsed_text.content, dict)
assert "1" in parsed_text.content, "Parsed text should contain page 1"
assert isinstance(parsed_text.content["1"], str)
matches = re.findall(
r"Abstract\nWe introduce PaSa, an advanced Paper ?Search"
r"\nagent powered by large language models.",
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -61,7 +61,7 @@ dev = [
"ipython>=8", # Pin to keep recent
"litellm>=1.68,<1.71", # Lower pin for PydanticDeprecatedSince20 fixes, upper pin for VCR cassette breaks (https://github.com/BerriAI/litellm/issues/11724)
"mypy>=1.8", # Pin for mutable-override
"paper-qa[ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
"paper-qa[image,ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
"pre-commit>=3.4", # Pin to keep recent
"pydantic~=2.11", # Pin for start of model_fields deprecation
"pylint-pydantic",
@@ -78,6 +78,9 @@ dev = [
"typeguard",
"vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884
]
image = [
"pillow>=10.3.0", # Pin for py.typed
]
ldp = [
"ldp>=0.25.0", # For new LLM client interface
]
76 changes: 60 additions & 16 deletions src/paperqa/core.py
@@ -2,16 +2,21 @@

import contextlib
import json
import logging
import re
from collections.abc import Callable, Sequence
from typing import Any, cast

import litellm
from aviary.core import Message
from lmi import LLMModel

from paperqa.prompts import text_with_tables_prompt_template
from paperqa.types import Context, LLMResult, Text
from paperqa.utils import extract_score, strip_citations

logger = logging.getLogger(__name__)


def llm_parse_json(text: str) -> dict:
"""Read LLM output and extract JSON data from it."""
@@ -136,6 +141,7 @@ async def map_fxn_summary(
parser: Callable[[str], dict[str, Any]] | None = None,
callbacks: Sequence[Callable[[str], None]] | None = None,
skip_citation_strip: bool = False,
evidence_text_only_fallback: bool = False,
) -> tuple[Context, LLMResult]:
"""Parses the given text and returns a context object with the parser and prompt runner.

@@ -154,6 +160,8 @@
Should return dict with at least 'summary' field.
callbacks: Optional sequence of callback functions to execute during LLM calls.
skip_citation_strip: Optionally skip citation stripping, if you want to keep citations in the context.
evidence_text_only_fallback: Opt-in flag to allow retrying context creation
without media in the completion.

Returns:
The context object and LLMResult to get info about the LLM execution.
@@ -163,25 +171,61 @@
extras: dict[str, Any] = {}
citation = text.name + ": " + text.doc.formatted_citation
success = False
used_text_only_fallback = False

# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
cleaned_text = text.text.strip("\n")
if summary_llm_model and prompt_templates:
media_text: list[str] = [m.text for m in text.media if m.text]
data = {
"question": question,
"citation": citation,
# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
"text": text.text.strip("\n"),
"text": (
text_with_tables_prompt_template.format(
text=cleaned_text,
citation=citation,
tables="\n\n----\n\n".join(media_text),
)
if media_text
else cleaned_text
),
} | (extra_prompt_data or {})
message_prompt, system_prompt = prompt_templates
messages = [
Message(role="system", content=system_prompt.format(**data)),
Message(role="user", content=message_prompt.format(**data)),
]
llm_result = await summary_llm_model.call_single(
messages=messages,
callbacks=callbacks,
name="evidence:" + text.name,
)
message_prompt, system_prompt = (pt.format(**data) for pt in prompt_templates)
try:
llm_result = await summary_llm_model.call_single(
messages=[
Message(role="system", content=system_prompt),
Message.create_message(
text=message_prompt,
images=(
[i.to_image_url() for i in text.media]
if text.media
else None
),
),
],
callbacks=callbacks,
name="evidence:" + text.name,
)
except litellm.BadRequestError as exc:
if not evidence_text_only_fallback:
raise
logger.warning(
f"LLM call to create a context failed with exception {exc!r}"
f" on text named {text.name!r}"
f" with doc name {text.doc.docname!r} and doc key {text.doc.dockey!r}."
f" Retrying without media."
)
llm_result = await summary_llm_model.call_single(
messages=[
Message(role="system", content=system_prompt),
Message(content=message_prompt),
],
callbacks=callbacks,
name="evidence:" + text.name,
)
used_text_only_fallback = True
context = cast("str", llm_result.text)
result_data = parser(context) if parser else {}
success = bool(result_data)
@@ -199,9 +243,7 @@
except KeyError:
success = False
else:
# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
context = text.text.strip("\n")
context = cleaned_text
# If we don't assign scores, just default to 5.
# why 5? Because we filter out 0s in another place
# and 5/10 is the other default I could come up with
@@ -213,6 +255,8 @@

if not success:
score = extract_score(context)
if used_text_only_fallback:
extras["used_images"] = False

return (
Context(
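Below is a hedged usage sketch of the updated `map_fxn_summary`. The keyword arguments, prompt names, and parser come from this diff and the surrounding module; the chunk and LLM setup are assumed to exist elsewhere:

```python
from paperqa.core import llm_parse_json, map_fxn_summary
from paperqa.prompts import summary_json_prompt, summary_json_system_prompt

async def summarize_chunk(text_chunk, summary_llm):
    # text_chunk: a paperqa.types.Text, possibly carrying parsed media (tables/images)
    # summary_llm: an lmi.LLMModel configured for (multimodal) completions
    context, llm_result = await map_fxn_summary(
        text=text_chunk,
        question="What method does the paper introduce?",
        summary_llm_model=summary_llm,
        prompt_templates=(summary_json_prompt, summary_json_system_prompt),
        extra_prompt_data={"summary_length": "about 100 words"},
        parser=llm_parse_json,
        evidence_text_only_fallback=True,  # new: retry text-only on litellm.BadRequestError
    )
    return context, llm_result
```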
6 changes: 4 additions & 2 deletions src/paperqa/docs.py
@@ -380,17 +380,18 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

texts = await read_doc(
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
)
# loose check to see if document was loaded
if (
if metadata.parse_type != "image" and (
not texts
or len(texts[0].text) < 10 # noqa: PLR2004
or (
@@ -669,6 +670,7 @@ async def aget_evidence(
parser=llm_parse_json if prompt_config.use_json else None,
callbacks=callbacks,
skip_citation_strip=answer_config.skip_evidence_citation_strip,
evidence_text_only_fallback=answer_config.evidence_text_only_fallback,
)
for m in matches
],
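End to end, the flag travels from `AnswerSettings` into `aget_evidence` as wired above. A rough caller-side sketch, assuming the public `Docs`/`Settings` API from the paper-qa README (the file name and question are made up):

```python
import asyncio

from paperqa import Docs, Settings

async def main() -> None:
    settings = Settings()
    settings.answer.evidence_text_only_fallback = True  # opt into the text-only retry

    docs = Docs()
    await docs.aadd("my_paper.pdf", settings=settings)  # hypothetical local PDF
    session = await docs.aget_evidence("What are the key findings?", settings=settings)
    print([c.context for c in session.contexts])

asyncio.run(main())
```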
38 changes: 24 additions & 14 deletions src/paperqa/prompts.py
@@ -1,20 +1,26 @@
from datetime import datetime

# ruff: noqa: E501

summary_prompt = (
"Summarize the excerpt below to help answer a question.\n\nExcerpt from"
" {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly"
" {citation}\n\n------------\n\n{text}\n\n------------"
"\n\nQuestion: {question}\n\nDo not directly"
" answer the question, instead summarize to give evidence to help answer the"
" question. Stay detailed; report specific numbers, equations, or direct quotes"
' (marked with quotation marks). Reply "Not applicable" if the excerpt is'
" irrelevant. At the end of your response, provide an integer score from 1-10 on a"
" newline indicating relevance to question. Do not explain your score.\n\nRelevant"
" Information Summary ({summary_length}):"
)
# This prompt template integrates with `text` variable of the above `summary_prompt`
text_with_tables_prompt_template = (
"{text}\n\n------------\n\nMarkdown tables from {citation}."
" If the markdown is poorly formatted, defer to the images"
"\n\n------------\n\n{tables}"
)

summary_json_prompt = (
"Excerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\n"
"Excerpt from {citation}\n\n------------\n\n{text}\n\n------------"
"\n\nQuestion: {question}\n\n"
)

# The below "cannot answer" sentinel phrase should:
@@ -45,7 +51,7 @@

qa_prompt = (
"Answer the question below with the context.\n\n"
"Context:\n\n{context}\n\n----\n\n"
"Context:\n\n{context}\n\n------------\n\n"
"Question: {question}\n\n"
"Write an answer based on the context. "
"If the context provides insufficient information reply "
@@ -99,15 +105,19 @@
)

# NOTE: we use double curly braces here so it's not considered an f-string template
summary_json_system_prompt = """\
Provide a summary of the relevant information that could help answer the question based on the excerpt. Respond with the following JSON format:

{{
"summary": "...",
"relevance_score": "..."
}}

where `summary` is relevant information from the text - {summary_length} words. `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."""
summary_json_system_prompt = (
"Provide a summary of the relevant information"
" that could help answer the question based on the excerpt."
" Your summary, combined with many others,"
" will be given to the model to generate an answer."
" Respond with the following JSON format:"
'\n\n{{\n  "summary": "...",\n  "relevance_score": "...",\n  "used_images": "..."\n}}'
"\n\nwhere `summary` is relevant information from the text - {summary_length} words."
" `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."
" `used_images` is a boolean flag indicating"
" if any images present in a multimodal message were used,"
" and if no images were present it should be false."
)

env_system_prompt = (
# Matching https://github.com/langchain-ai/langchain/blob/langchain%3D%3D0.2.3/libs/langchain/langchain/agents/openai_functions_agent/base.py#L213-L215
Expand Down