Skip to content

Commit 1236b7e

Browse files
committed
Refactored out PyMuPDF to its own plugin package
1 parent 6e9a64b commit 1236b7e

File tree

15 files changed

+981
-166
lines changed

15 files changed

+981
-166
lines changed

.github/workflows/build.yml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,29 @@ jobs:
1010
runs-on: ubuntu-latest
1111
steps:
1212
- uses: actions/checkout@v4
13-
- id: build
13+
- id: build-paper-qa-pymupdf
1414
uses: hynek/build-and-inspect-python-package@v2
15-
- name: Download built artifact to dist/
15+
with:
16+
path: packages/paper-qa-pymupdf
17+
upload-name-suffix: -paper-qa-pymupdf
18+
- name: Download built paper-qa-pymupdf artifact to dist/
19+
uses: actions/download-artifact@v4
20+
with:
21+
name: ${{ steps.build-paper-qa-pymupdf.outputs.artifact-name }}
22+
path: dist
23+
- name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
24+
run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
25+
- id: build-paper-qa
26+
uses: hynek/build-and-inspect-python-package@v2
27+
with:
28+
upload-name-suffix: -paper-qa
29+
- name: Download built paper-qa artifact to dist/
1630
uses: actions/download-artifact@v4
1731
with:
18-
name: ${{ steps.build.outputs.artifact-name }}
32+
name: ${{ steps.build-paper-qa.outputs.artifact-name }}
1933
path: dist
34+
- name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
35+
run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
2036
- uses: pypa/gh-action-pypi-publish@release/v1
2137
with:
2238
password: ${{ secrets.PYPI_API_TOKEN }}

.github/workflows/tests.yml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,27 @@ jobs:
3434
with:
3535
enable-cache: true
3636
- run: uv python pin ${{ matrix.python-version }}
37-
- uses: hynek/build-and-inspect-python-package@v2
37+
- name: Check paper-qa-pymupdf build
38+
id: build-paper-qa-pymupdf
39+
if: matrix.python-version == '3.11'
40+
uses: hynek/build-and-inspect-python-package@v2
41+
with:
42+
path: packages/paper-qa-pymupdf
43+
upload-name-suffix: -paper-qa-pymupdf
44+
- name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
45+
if: matrix.python-version == '3.11'
46+
run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
47+
- name: Check paper-qa build
48+
id: build-paper-qa
49+
if: matrix.python-version == '3.11'
50+
uses: hynek/build-and-inspect-python-package@v2
51+
with:
52+
upload-name-suffix: -paper-qa
53+
- name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
54+
if: matrix.python-version == '3.11'
55+
run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
3856
- run: uv sync --python-preference=only-managed
39-
- run: uv run pylint src
57+
- run: uv run pylint src packages
4058
- uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.1.1
4159
test:
4260
runs-on: ubuntu-latest

README.md

Lines changed: 66 additions & 66 deletions
Large diffs are not rendered by default.

packages/paper-qa-pymupdf/LICENSE

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# paper-qa-pymupdf
2+
3+
[![GitHub](https://img.shields.io/badge/github-%23121011.svg?logo=github&logoColor=white)](https://github.com/Future-House/paper-qa/tree/main/packages/paper-qa-pymupdf)
4+
[![PyPI version](https://badge.fury.io/py/paper-qa-pymupdf.svg)](https://badge.fury.io/py/paper-qa-pymupdf)
5+
[![tests](https://github.com/Future-House/paper-qa/actions/workflows/tests.yml/badge.svg)](https://github.com/Future-House/paper-qa)
6+
![License](https://img.shields.io/badge/license-AGPLv3-blue.svg)
7+
![PyPI Python Versions](https://img.shields.io/pypi/pyversions/paper-qa-pymupdf)
8+
9+
PDF reading code for PaperQA, backed by
10+
[PyMuPDF](https://github.com/pymupdf/PyMuPDF).
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
[build-system]
2+
build-backend = "setuptools.build_meta"
3+
requires = ["setuptools>=64", "setuptools_scm>=8"]
4+
5+
[project]
6+
authors = [
7+
{email = "hello@futurehouse.org", name = "FutureHouse technical staff"},
8+
]
9+
classifiers = [
10+
"Intended Audience :: Developers",
11+
"License :: OSI Approved :: GNU Affero General Public License v3",
12+
"Operating System :: OS Independent",
13+
"Programming Language :: Python :: 3 :: Only",
14+
"Programming Language :: Python :: 3.11",
15+
"Programming Language :: Python :: 3.12",
16+
"Programming Language :: Python :: 3.13",
17+
"Programming Language :: Python",
18+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
19+
]
20+
dependencies = [
21+
"PyMuPDF>=1.24.12", # For pymupdf.set_messages addition
22+
"paper-qa",
23+
]
24+
description = "PaperQA readers implemented using PyMuPDF"
25+
dynamic = ["version"]
26+
license = {file = "LICENSE"}
27+
maintainers = [
28+
{email = "jamesbraza@gmail.com", name = "James Braza"},
29+
{email = "michael.skarlinski@gmail.com", name = "Michael Skarlinski"},
30+
{email = "white.d.andrew@gmail.com", name = "Andrew White"},
31+
]
32+
name = "paper-qa-pymupdf"
33+
readme = "README.md"
34+
requires-python = ">=3.11"
35+
36+
[tool.ruff]
37+
extend = "../../pyproject.toml"
38+
39+
[tool.setuptools.packages.find]
40+
where = ["src"]
41+
42+
[tool.setuptools_scm]
43+
root = "../.."
44+
version_file = "src/paperqa_pymupdf/version.py"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from .reader import BLOCK_TEXT_INDEX, parse_pdf_to_pages, setup_pymupdf_python_logging
2+
3+
__all__ = [
4+
"BLOCK_TEXT_INDEX",
5+
"parse_pdf_to_pages",
6+
"setup_pymupdf_python_logging",
7+
]
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import os
2+
3+
import pymupdf
4+
from paperqa.types import ParsedMetadata, ParsedText
5+
from paperqa.utils import ImpossibleParsingError
6+
from paperqa.version import __version__ as pqa_version
7+
8+
9+
def setup_pymupdf_python_logging() -> None:
10+
"""
11+
Configure PyMuPDF to use Python logging.
12+
13+
SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
14+
"""
15+
pymupdf.set_messages(pylogging=True)
16+
17+
18+
BLOCK_TEXT_INDEX = 4
19+
20+
21+
def parse_pdf_to_pages(
22+
path: str | os.PathLike,
23+
page_size_limit: int | None = None,
24+
use_block_parsing: bool = False,
25+
**_,
26+
) -> ParsedText:
27+
28+
with pymupdf.open(path) as file:
29+
pages: dict[str, str] = {}
30+
total_length = 0
31+
32+
for i in range(file.page_count):
33+
try:
34+
page = file.load_page(i)
35+
except pymupdf.mupdf.FzErrorFormat as exc:
36+
raise ImpossibleParsingError(
37+
f"Page loading via {pymupdf.__name__} failed on page {i} of"
38+
f" {file.page_count} for the PDF at path {path}, likely this PDF"
39+
" file is corrupt."
40+
) from exc
41+
42+
if use_block_parsing:
43+
# NOTE: this block-based parsing appears to be better, but until
44+
# fully validated on at least one benchmark, it's considered experimental
45+
46+
# Extract text blocks from the page
47+
# NOTE: sort=False is important to preserve the order of text blocks
48+
# as they appear in the PDF
49+
blocks = page.get_text("blocks", sort=False)
50+
51+
# Concatenate text blocks into a single string
52+
text = "\n".join(
53+
block[BLOCK_TEXT_INDEX]
54+
for block in blocks
55+
if len(block) > BLOCK_TEXT_INDEX
56+
)
57+
else:
58+
text = page.get_text("text", sort=True)
59+
60+
if page_size_limit and len(text) > page_size_limit:
61+
raise ImpossibleParsingError(
62+
f"The text in page {i} of {file.page_count} was {len(text)} chars"
63+
f" long, which exceeds the {page_size_limit} char limit for the PDF"
64+
f" at path {path}."
65+
)
66+
pages[str(i + 1)] = text
67+
total_length += len(text)
68+
69+
metadata = ParsedMetadata(
70+
parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
71+
paperqa_version=pqa_version,
72+
total_parsed_text_length=total_length,
73+
parse_type="pdf",
74+
)
75+
return ParsedText(content=pages, metadata=metadata)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pathlib import Path
2+
3+
from paperqa_pymupdf import parse_pdf_to_pages
4+
5+
REPO_ROOT = Path(__file__).parents[3]
6+
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
7+
8+
9+
def test_parse_pdf_to_pages() -> None:
10+
filepath = STUB_DATA_DIR / "pasa.pdf"
11+
parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
12+
assert isinstance(parsed_text.content, dict)
13+
assert "1" in parsed_text.content, "Parsed text should contain page 1"
14+
assert (
15+
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
16+
"\nagent powered by large language models."
17+
) in parsed_text.content["1"]

pyproject.toml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ classifiers = [
2424
"Topic :: Scientific/Engineering :: Artificial Intelligence",
2525
]
2626
dependencies = [
27-
"PyMuPDF>=1.24.12", # For pymupdf.set_messages addition
2827
"aiohttp>=3.10.6", # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError
2928
"anyio",
3029
"fhaviary[llm]>=0.20", # For Environment.get_id
3130
"fhlmi>=0.25.4", # For LLM reasoning
3231
"html2text", # TODO: evaluate moving to an opt-in dependency
3332
"httpx",
3433
"numpy",
34+
"paper-qa-pymupdf",
3535
"pybtex",
3636
"pydantic-settings",
3737
"pydantic~=2.0,>=2.10.1", # Pin 2.10 for typing breaks
@@ -96,6 +96,7 @@ typing = [
9696
"types-setuptools",
9797
]
9898
zotero = [
99+
"paper-qa-pymupdf",
99100
"pyzotero",
100101
]
101102

@@ -155,6 +156,11 @@ error_summary = false
155156
exclude = [
156157
"^\\.?venv", # SEE: https://regex101.com/r/0rp5Br/1
157158
]
159+
# Specifies the paths to use, after trying the paths from MYPYPATH environment variable.
160+
# Useful if you'd like to keep stubs in your repo, along with the config file.
161+
# Multiple paths are always separated with a : or , regardless of the platform.
162+
# User home directory and environment variables will be expanded.
163+
mypy_path = "$MYPY_CONFIG_FILE_DIR/src,$MYPY_CONFIG_FILE_DIR/packages/paper-qa-pymupdf/src"
158164
# Specifies the OS platform for the target program, for example darwin or win32
159165
# (meaning OS X or Windows, respectively). The default is the current platform
160166
# as revealed by Python’s sys.platform variable.
@@ -596,3 +602,10 @@ extend-exclude = [
596602
"tests/cassettes/**",
597603
"tests/stub_data/**",
598604
]
605+
606+
[tool.uv.sources]
607+
paper-qa = {workspace = true}
608+
paper-qa-pymupdf = {workspace = true}
609+
610+
[tool.uv.workspace]
611+
members = ["packages/*"]

0 commit comments

Comments
 (0)