1 change: 1 addition & 0 deletions README.md
@@ -880,6 +880,7 @@ will return much faster than the first query and we'll be certain the authors match
| `answer.evidence_retrieval` | `True` | Use retrieval vs processing all docs. |
| `answer.evidence_summary_length` | `"about 100 words"` | Length of evidence summary. |
| `answer.evidence_skip_summary` | `False` | Whether to skip summarization. |
| `answer.evidence_text_only_fallback` | `False` | Whether to allow context creation to retry without media present. |
| `answer.answer_max_sources` | `5` | Max number of sources for an answer. |
| `answer.max_answer_attempts` | `None` | Max attempts to generate an answer. |
| `answer.answer_length` | `"about 200 words, but can be longer"` | Length of final answer. |
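For context, the new flag is toggled through a `Settings` object. A minimal, untested sketch follows; the `Settings` import and the nested `answer` config are assumed from the existing paper-qa API, and only the flag name comes from this diff:

```python
from paperqa import Settings

# Sketch: opt into retrying context creation without media when the
# multimodal LLM call is rejected by the provider.
settings = Settings()
settings.answer.evidence_text_only_fallback = True
```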
1 change: 1 addition & 0 deletions packages/paper-qa-pypdf/tests/test_paperqa_pypdf.py
@@ -19,6 +19,7 @@ def test_parse_pdf_to_pages() -> None:
parsed_text = parse_pdf_to_pages(filepath)
assert isinstance(parsed_text.content, dict)
assert "1" in parsed_text.content, "Parsed text should contain page 1"
assert isinstance(parsed_text.content["1"], str)
matches = re.findall(
r"Abstract\nWe introduce PaSa, an advanced Paper ?Search"
r"\nagent powered by large language models.",
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -61,7 +61,7 @@ dev = [
"ipython>=8", # Pin to keep recent
"litellm>=1.68,<1.71", # Lower pin for PydanticDeprecatedSince20 fixes, upper pin for VCR cassette breaks (https://github.com/BerriAI/litellm/issues/11724)
"mypy>=1.8", # Pin for mutable-override
"paper-qa[ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
"paper-qa[image,ldp,pypdf,pymupdf,typing,zotero,local,qdrant]",
"pre-commit>=3.4", # Pin to keep recent
"pydantic~=2.11", # Pin for start of model_fields deprecation
"pylint-pydantic",
@@ -78,6 +78,9 @@ dev = [
"typeguard",
"vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884
]
image = [
"pillow>=10.3.0", # Pin for py.typed
]
ldp = [
"ldp>=0.25.0", # For new LLM client interface
]
76 changes: 60 additions & 16 deletions src/paperqa/core.py
@@ -2,16 +2,21 @@

import contextlib
import json
import logging
import re
from collections.abc import Callable, Sequence
from typing import Any, cast

import litellm
from aviary.core import Message
from lmi import LLMModel

from paperqa.prompts import text_with_tables_prompt_template
from paperqa.types import Context, LLMResult, Text
from paperqa.utils import extract_score, strip_citations

logger = logging.getLogger(__name__)


def llm_parse_json(text: str) -> dict:
"""Read LLM output and extract JSON data from it."""
@@ -136,6 +141,7 @@ async def map_fxn_summary(
parser: Callable[[str], dict[str, Any]] | None = None,
callbacks: Sequence[Callable[[str], None]] | None = None,
skip_citation_strip: bool = False,
evidence_text_only_fallback: bool = False,
) -> tuple[Context, LLMResult]:
"""Parses the given text and returns a context object with the parser and prompt runner.

@@ -154,6 +160,8 @@
Should return dict with at least 'summary' field.
callbacks: Optional sequence of callback functions to execute during LLM calls.
skip_citation_strip: Optionally skip citation stripping, if you want to keep citations in the context.
evidence_text_only_fallback: Opt-in flag to allow retrying context creation
without media in the completion.

Returns:
The context object and LLMResult to get info about the LLM execution.
@@ -163,25 +171,61 @@
extras: dict[str, Any] = {}
citation = text.name + ": " + text.doc.formatted_citation
success = False
used_text_only_fallback = False

# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
cleaned_text = text.text.strip("\n")
if summary_llm_model and prompt_templates:
media_text: list[str] = [m.text for m in text.media if m.text]
data = {
"question": question,
"citation": citation,
# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
"text": text.text.strip("\n"),
"text": (
text_with_tables_prompt_template.format(
text=cleaned_text,
citation=citation,
tables="\n\n----\n\n".join(media_text),
)
if media_text
else cleaned_text
),
} | (extra_prompt_data or {})
message_prompt, system_prompt = prompt_templates
messages = [
Message(role="system", content=system_prompt.format(**data)),
Message(role="user", content=message_prompt.format(**data)),
]
llm_result = await summary_llm_model.call_single(
messages=messages,
callbacks=callbacks,
name="evidence:" + text.name,
)
message_prompt, system_prompt = (pt.format(**data) for pt in prompt_templates)
try:
llm_result = await summary_llm_model.call_single(
messages=[
Message(role="system", content=system_prompt),
Message.create_message(
text=message_prompt,
images=(
[i.to_image_url() for i in text.media]
if text.media
else None
),
),
],
callbacks=callbacks,
name="evidence:" + text.name,
)
except litellm.BadRequestError as exc:
if not evidence_text_only_fallback:
raise
logger.warning(
f"LLM call to create a context failed with exception {exc!r}"
f" on text named {text.name!r}"
f" with doc name {text.doc.docname!r} and doc key {text.doc.dockey!r}."
f" Retrying without media."
)
llm_result = await summary_llm_model.call_single(
messages=[
Message(role="system", content=system_prompt),
Message(content=message_prompt),
],
callbacks=callbacks,
name="evidence:" + text.name,
)
used_text_only_fallback = True
context = cast("str", llm_result.text)
result_data = parser(context) if parser else {}
success = bool(result_data)
@@ -199,9 +243,7 @@
except KeyError:
success = False
else:
# Strip newlines in case chunking led to blank lines,
# but not spaces, to preserve text alignment
context = text.text.strip("\n")
context = cleaned_text
# If we don't assign scores, just default to 5.
# why 5? Because we filter out 0s in another place
# and 5/10 is the other default I could come up with
@@ -213,6 +255,8 @@

if not success:
score = extract_score(context)
if used_text_only_fallback:
extras["used_images"] = False

return (
Context(
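Below is a hedged usage sketch of the updated `map_fxn_summary`. The keyword arguments, prompt names, and parser come from this diff and the surrounding module; the chunk and LLM setup are assumed to exist elsewhere:

```python
from paperqa.core import llm_parse_json, map_fxn_summary
from paperqa.prompts import summary_json_prompt, summary_json_system_prompt

async def summarize_chunk(text_chunk, summary_llm):
    # text_chunk: a paperqa.types.Text, possibly carrying parsed media (tables/images)
    # summary_llm: an lmi.LLMModel configured for (multimodal) completions
    context, llm_result = await map_fxn_summary(
        text=text_chunk,
        question="What method does the paper introduce?",
        summary_llm_model=summary_llm,
        prompt_templates=(summary_json_prompt, summary_json_system_prompt),
        extra_prompt_data={"summary_length": "about 100 words"},
        parser=llm_parse_json,
        evidence_text_only_fallback=True,  # new: retry text-only on litellm.BadRequestError
    )
    return context, llm_result
```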
6 changes: 4 additions & 2 deletions src/paperqa/docs.py
@@ -380,17 +380,18 @@ async def aadd( # noqa: PLR0912
doc, **(query_kwargs | kwargs)
)

texts = await read_doc(
texts, metadata = await read_doc(
path,
doc,
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
use_block_parsing=parse_config.pdfs_use_block_parsing,
parse_pdf=parse_config.parse_pdf,
include_metadata=True,
)
# loose check to see if document was loaded
if (
if metadata.parse_type != "image" and (
not texts
or len(texts[0].text) < 10 # noqa: PLR2004
or (
@@ -669,6 +670,7 @@ async def aget_evidence(
parser=llm_parse_json if prompt_config.use_json else None,
callbacks=callbacks,
skip_citation_strip=answer_config.skip_evidence_citation_strip,
evidence_text_only_fallback=answer_config.evidence_text_only_fallback,
)
for m in matches
],
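End to end, the flag travels from `AnswerSettings` into `aget_evidence` as wired above. A rough caller-side sketch, assuming the public `Docs`/`Settings` API from the paper-qa README (the file name and question are made up):

```python
import asyncio

from paperqa import Docs, Settings

async def main() -> None:
    settings = Settings()
    settings.answer.evidence_text_only_fallback = True  # opt into the text-only retry

    docs = Docs()
    await docs.aadd("my_paper.pdf", settings=settings)  # hypothetical local PDF
    session = await docs.aget_evidence("What are the key findings?", settings=settings)
    print([c.context for c in session.contexts])

asyncio.run(main())
```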
38 changes: 24 additions & 14 deletions src/paperqa/prompts.py
@@ -1,20 +1,26 @@
from datetime import datetime

# ruff: noqa: E501

summary_prompt = (
"Summarize the excerpt below to help answer a question.\n\nExcerpt from"
" {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\nDo not directly"
" {citation}\n\n------------\n\n{text}\n\n------------"
"\n\nQuestion: {question}\n\nDo not directly"
" answer the question, instead summarize to give evidence to help answer the"
" question. Stay detailed; report specific numbers, equations, or direct quotes"
' (marked with quotation marks). Reply "Not applicable" if the excerpt is'
" irrelevant. At the end of your response, provide an integer score from 1-10 on a"
" newline indicating relevance to question. Do not explain your score.\n\nRelevant"
" Information Summary ({summary_length}):"
)
# This prompt template integrates with `text` variable of the above `summary_prompt`
text_with_tables_prompt_template = (
"{text}\n\n------------\n\nMarkdown tables from {citation}."
" If the markdown is poorly formatted, defer to the images"
"\n\n------------\n\n{tables}"
)

summary_json_prompt = (
"Excerpt from {citation}\n\n----\n\n{text}\n\n----\n\nQuestion: {question}\n\n"
"Excerpt from {citation}\n\n------------\n\n{text}\n\n------------"
"\n\nQuestion: {question}\n\n"
)

# The below "cannot answer" sentinel phrase should:
@@ -45,7 +51,7 @@

qa_prompt = (
"Answer the question below with the context.\n\n"
"Context:\n\n{context}\n\n----\n\n"
"Context:\n\n{context}\n\n------------\n\n"
"Question: {question}\n\n"
"Write an answer based on the context. "
"If the context provides insufficient information reply "
@@ -99,15 +105,19 @@
)

# NOTE: we use double curly braces here so it's not considered an f-string template
summary_json_system_prompt = """\
Provide a summary of the relevant information that could help answer the question based on the excerpt. Respond with the following JSON format:

{{
"summary": "...",
"relevance_score": "..."
}}

where `summary` is relevant information from the text - {summary_length} words. `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."""
summary_json_system_prompt = (
"Provide a summary of the relevant information"
" that could help answer the question based on the excerpt."
" Your summary, combined with many others,"
" will be given to the model to generate an answer."
" Respond with the following JSON format:"
'\n\n{{\n  "summary": "...",\n  "relevance_score": "...",\n  "used_images": "..."\n}}'
"\n\nwhere `summary` is relevant information from the text - {summary_length} words."
" `relevance_score` is an integer 1-10 for the relevance of `summary` to the question."
" `used_images` is a boolean flag indicating"
" if any images present in a multimodal message were used,"
" and if no images were present it should be false."
)

env_system_prompt = (
# Matching https://github.com/langchain-ai/langchain/blob/langchain%3D%3D0.2.3/libs/langchain/langchain/agents/openai_functions_agent/base.py#L213-L215
Expand Down