Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,29 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- id: build
- id: build-paper-qa-pymupdf
uses: hynek/build-and-inspect-python-package@v2
- name: Download built artifact to dist/
with:
path: packages/paper-qa-pymupdf
upload-name-suffix: -paper-qa-pymupdf
- name: Download built paper-qa-pymupdf artifact to dist/
uses: actions/download-artifact@v4
with:
name: ${{ steps.build-paper-qa-pymupdf.outputs.artifact-name }}
path: dist
- name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
- id: build-paper-qa
uses: hynek/build-and-inspect-python-package@v2
with:
upload-name-suffix: -paper-qa
- name: Download built paper-qa artifact to dist/
uses: actions/download-artifact@v4
with:
name: ${{ steps.build.outputs.artifact-name }}
name: ${{ steps.build-paper-qa.outputs.artifact-name }}
path: dist
- name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
- uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
22 changes: 20 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,27 @@ jobs:
with:
enable-cache: true
- run: uv python pin ${{ matrix.python-version }}
- uses: hynek/build-and-inspect-python-package@v2
- name: Check paper-qa-pymupdf build
id: build-paper-qa-pymupdf
if: matrix.python-version == '3.11'
uses: hynek/build-and-inspect-python-package@v2
with:
path: packages/paper-qa-pymupdf
upload-name-suffix: -paper-qa-pymupdf
- name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
if: matrix.python-version == '3.11'
run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
- name: Check paper-qa build
id: build-paper-qa
if: matrix.python-version == '3.11'
uses: hynek/build-and-inspect-python-package@v2
with:
upload-name-suffix: -paper-qa
- name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
if: matrix.python-version == '3.11'
run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
- run: uv sync --python-preference=only-managed
- run: uv run pylint paperqa
- run: uv run pylint src packages
- uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.1.1
test:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -311,4 +311,4 @@ tests/example2.*
!tests/stub_data/.DS_Store

# Client data
paperqa/clients/client_data/retractions.csv
src/paperqa/clients/client_data/retractions.csv
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ repos:
- id: check-added-large-files
exclude: |
(?x)^(
paperqa/clients/client_data.*|
src/paperqa/clients/client_data.*|
tests/stub_data.*
)$
- id: check-byte-order-marker
Expand Down
134 changes: 67 additions & 67 deletions README.md

Large diffs are not rendered by default.

661 changes: 661 additions & 0 deletions packages/paper-qa-pymupdf/LICENSE

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions packages/paper-qa-pymupdf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# paper-qa-pymupdf

[![GitHub](https://img.shields.io/badge/github-%23121011.svg?logo=github&logoColor=white)](https://github.com/Future-House/paper-qa/tree/main/packages/paper-qa-pymupdf)
[![PyPI version](https://badge.fury.io/py/paper-qa-pymupdf.svg)](https://badge.fury.io/py/paper-qa-pymupdf)
[![tests](https://github.com/Future-House/paper-qa/actions/workflows/tests.yml/badge.svg)](https://github.com/Future-House/paper-qa)
![License](https://img.shields.io/badge/license-AGPLv3-blue.svg)
![PyPI Python Versions](https://img.shields.io/pypi/pyversions/paper-qa-pymupdf)

PDF reading code backed by
[PyMuPDF](https://github.com/pymupdf/PyMuPDF).
44 changes: 44 additions & 0 deletions packages/paper-qa-pymupdf/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools>=64", "setuptools_scm>=8"]

[project]
authors = [
{email = "hello@futurehouse.org", name = "FutureHouse technical staff"},
]
classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU Affero General Public License v3",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"PyMuPDF>=1.24.12", # For pymupdf.set_messages addition
"paper-qa",
]
description = "PaperQA readers implemented using PyMuPDF"
dynamic = ["version"]
license = {file = "LICENSE"}
maintainers = [
{email = "jamesbraza@gmail.com", name = "James Braza"},
{email = "michael.skarlinski@gmail.com", name = "Michael Skarlinski"},
{email = "white.d.andrew@gmail.com", name = "Andrew White"},
]
name = "paper-qa-pymupdf"
readme = "README.md"
requires-python = ">=3.11"

# Reuse the Ruff configuration from the pyproject.toml two directories up
[tool.ruff]
extend = "../../pyproject.toml"

# Look for importable packages under this package's src/ directory
[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools_scm]
# The SCM root is two directories up from this pyproject.toml
root = "../.."
# File that setuptools_scm writes the computed version into
version_file = "src/paperqa_pymupdf/version.py"
7 changes: 7 additions & 0 deletions packages/paper-qa-pymupdf/src/paperqa_pymupdf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .reader import BLOCK_TEXT_INDEX, parse_pdf_to_pages, setup_pymupdf_python_logging

# Public names of this package, all imported from the reader module above
__all__ = [
    "BLOCK_TEXT_INDEX",
    "parse_pdf_to_pages",
    "setup_pymupdf_python_logging",
]
75 changes: 75 additions & 0 deletions packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os

import pymupdf
from paperqa.types import ParsedMetadata, ParsedText
from paperqa.utils import ImpossibleParsingError
from paperqa.version import __version__ as pqa_version


def setup_pymupdf_python_logging() -> None:
    """
    Make PyMuPDF emit its messages via Python logging.

    This is a thin wrapper that calls `pymupdf.set_messages(pylogging=True)`.

    SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
    """
    pymupdf.set_messages(pylogging=True)


# Position of the text string within each block returned by
# `page.get_text("blocks")`, as indexed by parse_pdf_to_pages below
BLOCK_TEXT_INDEX = 4


def parse_pdf_to_pages(
    path: str | os.PathLike,
    page_size_limit: int | None = None,
    use_block_parsing: bool = False,
    **_,
) -> ParsedText:
    """
    Extract the text of every page of a PDF using PyMuPDF.

    Args:
        path: Location of the PDF to open.
        page_size_limit: Optional cap on the character count of a single
            page's text. A falsy value (None or 0) disables the check.
        use_block_parsing: If True, build each page's text by joining the
            page's blocks in their unsorted order (experimental). If False,
            use plain-text extraction with sort=True.
        **_: Any extra keyword arguments are accepted and ignored.

    Returns:
        ParsedText whose content maps 1-based page numbers (as strings) to the
        page's text, plus metadata describing the parse.

    Raises:
        ImpossibleParsingError: If loading a page raises
            pymupdf.mupdf.FzErrorFormat, or a page's text is longer than
            page_size_limit.
    """
    text_by_page: dict[str, str] = {}

    with pymupdf.open(path) as file:
        for i in range(file.page_count):
            try:
                page = file.load_page(i)
            except pymupdf.mupdf.FzErrorFormat as exc:
                raise ImpossibleParsingError(
                    f"Page loading via {pymupdf.__name__} failed on page {i} of"
                    f" {file.page_count} for the PDF at path {path}, likely this PDF"
                    " file is corrupt."
                ) from exc

            if not use_block_parsing:
                text = page.get_text("text", sort=True)
            else:
                # NOTE: this block-based parsing appears to be better, but until
                # fully validated on 1+ benchmarks, it's considered experimental.
                # sort=False matters here: it preserves the order of text blocks
                # as they appear in the PDF
                text = "\n".join(
                    block[BLOCK_TEXT_INDEX]
                    for block in page.get_text("blocks", sort=False)
                    if len(block) > BLOCK_TEXT_INDEX
                )

            if page_size_limit and len(text) > page_size_limit:
                raise ImpossibleParsingError(
                    f"The text in page {i} of {file.page_count} was {len(text)} chars"
                    f" long, which exceeds the {page_size_limit} char limit for the PDF"
                    f" at path {path}."
                )
            # Keys are 1-based page numbers, unlike the 0-based loop index
            text_by_page[str(i + 1)] = text

    return ParsedText(
        content=text_by_page,
        metadata=ParsedMetadata(
            parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
            paperqa_version=pqa_version,
            # Each page is stored exactly once, so this equals a running total
            total_parsed_text_length=sum(map(len, text_by_page.values())),
            parse_type="pdf",
        ),
    )
17 changes: 17 additions & 0 deletions packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pathlib import Path

from paperqa_pymupdf import parse_pdf_to_pages

# This file sits in packages/paper-qa-pymupdf/tests/, so parents[3] climbs past
# tests/, paper-qa-pymupdf/ and packages/ to reach the repository root
REPO_ROOT = Path(__file__).parents[3]
# Stub data lives under the repository's top-level tests directory
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"


def test_parse_pdf_to_pages() -> None:
    """Check that block parsing of the PaSa stub PDF yields the expected page 1 text."""
    filepath = STUB_DATA_DIR / "pasa.pdf"
    content = parse_pdf_to_pages(filepath, use_block_parsing=True).content
    assert isinstance(content, dict)
    assert "1" in content, "Parsed text should contain page 1"
    expected_snippet = (
        "Abstract\n\nWe introduce PaSa, an advanced Paper Search"
        "\nagent powered by large language models."
    )
    assert expected_snippet in content["1"]
25 changes: 19 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"PyMuPDF>=1.24.12", # For pymupdf.set_messages addition
"aiohttp>=3.10.6", # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError
"anyio",
"fhaviary[llm]>=0.20", # For Environment.get_id
"fhlmi>=0.25.4", # For LLM reasoning
"html2text", # TODO: evaluate moving to an opt-in dependency
"httpx",
"numpy",
"paper-qa-pymupdf",
"pybtex",
"pydantic-settings",
"pydantic~=2.0,>=2.10.1", # Pin 2.10 for typing breaks
Expand Down Expand Up @@ -96,6 +96,7 @@ typing = [
"types-setuptools",
]
zotero = [
"paper-qa-pymupdf",
"pyzotero",
]

Expand All @@ -115,7 +116,7 @@ check-hidden = true
ignore-words-list = "aadd,astroid,ser,ECT"
skip = [
"docs/2024-10-16_litqa2-splits.json5",
"paperqa/clients/client_data/*",
"src/paperqa/clients/client_data/*",
"tests/cassettes/*",
"tests/stub_data/*",
]
Expand Down Expand Up @@ -155,6 +156,11 @@ error_summary = false
exclude = [
"^\\.?venv", # SEE: https://regex101.com/r/0rp5Br/1
]
# Specifies the paths to use, after trying the paths from MYPYPATH environment variable.
# Useful if you'd like to keep stubs in your repo, along with the config file.
# Multiple paths are always separated with a : or , regardless of the platform.
# User home directory and environment variables will be expanded.
mypy_path = "$MYPY_CONFIG_FILE_DIR/src,$MYPY_CONFIG_FILE_DIR/packages/paper-qa-pymupdf/src"
# Specifies the OS platform for the target program, for example darwin or win32
# (meaning OS X or Windows, respectively). The default is the current platform
# as revealed by Python’s sys.platform variable.
Expand Down Expand Up @@ -306,7 +312,7 @@ filterwarnings = [
# files or test ids are given in the command line when executing pytest from the rootdir
# directory. File system paths may use shell-style wildcards, including the recursive **
# pattern.
testpaths = ["tests"]
testpaths = ["packages", "tests"]

[tool.refurb]
enable_all = true
Expand Down Expand Up @@ -571,10 +577,10 @@ convention = "google"
paperqa = ["configs/**json"]

[tool.setuptools.packages.find]
include = ["paperqa*"]
where = ["src"]

[tool.setuptools_scm]
version_file = "paperqa/version.py"
version_file = "src/paperqa/version.py"

[tool.tomlsort]
all = true
Expand All @@ -592,7 +598,14 @@ aadd = "aadd"

[tool.typos.files]
extend-exclude = [
"paperqa/clients/client_data/journal_quality.csv",
"src/paperqa/clients/client_data/journal_quality.csv",
"tests/cassettes/**",
"tests/stub_data/**",
]

[tool.uv.sources]
paper-qa = {workspace = true}
paper-qa-pymupdf = {workspace = true}

[tool.uv.workspace]
members = ["packages/*"]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion paperqa/contrib/zotero.py → src/paperqa/contrib/zotero.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import cast

from paperqa_pymupdf import parse_pdf_to_pages
from pydantic import BaseModel

try:
Expand All @@ -15,7 +16,7 @@
" `pip install paper-qa[zotero]`."
) from e
from paperqa.paths import PAPERQA_DIR
from paperqa.readers import PDFParserFn, parse_pdf_to_pages
from paperqa.readers import PDFParserFn


class ZoteroPaper(BaseModel):
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading