Refactored out PyMuPDF to its own plugin package

jamesbraza · jamesbraza · commit 1236b7e66aff · 2025-07-22T22:12:59.000-07:00
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,13 +10,29 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - id: build
+      - id: build-paper-qa-pymupdf
         uses: hynek/build-and-inspect-python-package@v2
-      - name: Download built artifact to dist/
+        with:
+          path: packages/paper-qa-pymupdf
+          upload-name-suffix: -paper-qa-pymupdf
+      - name: Download built paper-qa-pymupdf artifact to dist/
+        uses: actions/download-artifact@v4
+        with:
+          name: ${{ steps.build-paper-qa-pymupdf.outputs.artifact-name }}
+          path: dist
+      - name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
+      - id: build-paper-qa
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          upload-name-suffix: -paper-qa
+      - name: Download built paper-qa artifact to dist/
         uses: actions/download-artifact@v4
         with:
-          name: ${{ steps.build.outputs.artifact-name }}
+          name: ${{ steps.build-paper-qa.outputs.artifact-name }}
           path: dist
+      - name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
       - uses: pypa/gh-action-pypi-publish@release/v1
         with:
           password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -34,9 +34,27 @@ jobs:
         with:
           enable-cache: true
       - run: uv python pin ${{ matrix.python-version }}
-      - uses: hynek/build-and-inspect-python-package@v2
+      - name: Check paper-qa-pymupdf build
+        id: build-paper-qa-pymupdf
+        if: matrix.python-version == '3.11'
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          path: packages/paper-qa-pymupdf
+          upload-name-suffix: -paper-qa-pymupdf
+      - name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        if: matrix.python-version == '3.11'
+        run: rm -r ${{ steps.build-paper-qa-pymupdf.outputs.dist }}
+      - name: Check paper-qa build
+        id: build-paper-qa
+        if: matrix.python-version == '3.11'
+        uses: hynek/build-and-inspect-python-package@v2
+        with:
+          upload-name-suffix: -paper-qa
+      - name: Clean up paper-qa build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
+        if: matrix.python-version == '3.11'
+        run: rm -r ${{ steps.build-paper-qa.outputs.dist }}
       - run: uv sync --python-preference=only-managed
-      - run: uv run pylint src
+      - run: uv run pylint src packages
       - uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.1.1
   test:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
diff --git a/packages/paper-qa-pymupdf/LICENSE b/packages/paper-qa-pymupdf/LICENSE
diff --git a/packages/paper-qa-pymupdf/README.md b/packages/paper-qa-pymupdf/README.md
@@ -0,0 +1,10 @@
+# paper-qa-pymupdf
+
+[![GitHub](https://img.shields.io/badge/github-%23121011.svg?logo=github&logoColor=white)](https://github.com/Future-House/paper-qa/tree/main/packages/paper-qa-pymupdf)
+[![PyPI version](https://badge.fury.io/py/paper-qa-pymupdf.svg)](https://badge.fury.io/py/paper-qa-pymupdf)
+[![tests](https://github.com/Future-House/paper-qa/actions/workflows/tests.yml/badge.svg)](https://github.com/Future-House/paper-qa)
+![License](https://img.shields.io/badge/license-AGPLv3-blue.svg)
+![PyPI Python Versions](https://img.shields.io/pypi/pyversions/paper-qa-pymupdf)
+
+PDF reading code backed by
+[PyMuPDF](https://github.com/pymupdf/PyMuPDF).
diff --git a/packages/paper-qa-pymupdf/pyproject.toml b/packages/paper-qa-pymupdf/pyproject.toml
@@ -0,0 +1,44 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools>=64", "setuptools_scm>=8"]
+
+[project]
+authors = [
+    {email = "hello@futurehouse.org", name = "FutureHouse technical staff"},
+]
+classifiers = [
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: GNU Affero General Public License v3",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "PyMuPDF>=1.24.12",  # For pymupdf.set_messages addition
+    "paper-qa",
+]
+description = "PaperQA readers implemented using PyMuPDF"
+dynamic = ["version"]
+license = {file = "LICENSE"}
+maintainers = [
+    {email = "jamesbraza@gmail.com", name = "James Braza"},
+    {email = "michael.skarlinski@gmail.com", name = "Michael Skarlinski"},
+    {email = "white.d.andrew@gmail.com", name = "Andrew White"},
+]
+name = "paper-qa-pymupdf"
+readme = "README.md"
+requires-python = ">=3.11"
+
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools_scm]
+root = "../.."
+version_file = "src/paperqa_pymupdf/version.py"
diff --git a/packages/paper-qa-pymupdf/src/paperqa_pymupdf/__init__.py b/packages/paper-qa-pymupdf/src/paperqa_pymupdf/__init__.py
@@ -0,0 +1,7 @@
+from .reader import BLOCK_TEXT_INDEX, parse_pdf_to_pages, setup_pymupdf_python_logging
+
+__all__ = [
+    "BLOCK_TEXT_INDEX",
+    "parse_pdf_to_pages",
+    "setup_pymupdf_python_logging",
+]
diff --git a/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py b/packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
@@ -0,0 +1,75 @@
+import os
+
+import pymupdf
+from paperqa.types import ParsedMetadata, ParsedText
+from paperqa.utils import ImpossibleParsingError
+from paperqa.version import __version__ as pqa_version
+
+
+def setup_pymupdf_python_logging() -> None:
+    """
+    Configure PyMuPDF to use Python logging.
+
+    This is a thin wrapper around `pymupdf.set_messages(pylogging=True)`.
+    This package's `PyMuPDF>=1.24.12` dependency pin exists for `set_messages`.
+
+    SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
+    """
+    pymupdf.set_messages(pylogging=True)
+
+
+# Position of the text payload within each block returned by
+# `page.get_text("blocks")`, used to pull the text out of every block
+BLOCK_TEXT_INDEX = 4
+
+
+def parse_pdf_to_pages(
+    path: str | os.PathLike,
+    page_size_limit: int | None = None,
+    use_block_parsing: bool = False,
+    **_,
+) -> ParsedText:
+    """
+    Parse a PDF into per-page text using PyMuPDF.
+
+    Args:
+        path: Path to the PDF file to open.
+        page_size_limit: Optional cap on the character count of any single page's
+            text. A falsy value (None or 0) disables the check.
+        use_block_parsing: If True, extract text block-by-block without sorting
+            (experimental), otherwise extract sorted plain text.
+        **_: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        ParsedText whose content maps 1-based page number strings to that page's
+        text, and whose metadata records the PyMuPDF and paper-qa versions plus
+        the total extracted character count.
+
+    Raises:
+        ImpossibleParsingError: If a page fails to load (likely a corrupt PDF), or
+            if a page's text is longer than page_size_limit.
+    """
+    with pymupdf.open(path) as file:
+        pages: dict[str, str] = {}
+        total_length = 0
+
+        for i in range(file.page_count):
+            # PyMuPDF indexes pages from 0, but both the returned content keys and
+            # the error messages below use the human-facing 1-based page number
+            page_num = i + 1
+            try:
+                page = file.load_page(i)
+            except pymupdf.mupdf.FzErrorFormat as exc:
+                raise ImpossibleParsingError(
+                    f"Page loading via {pymupdf.__name__} failed on page {page_num} of"
+                    f" {file.page_count} for the PDF at path {path}, likely this PDF"
+                    " file is corrupt."
+                ) from exc
+
+            if use_block_parsing:
+                # NOTE: this block-based parsing appears to be better, but until
+                # fully validated on 1+ benchmarks, it's considered experimental
+
+                # Extract text blocks from the page
+                # Note: sort=False is important to preserve the order of text blocks
+                # as they appear in the PDF
+                blocks = page.get_text("blocks", sort=False)
+
+                # Concatenate text blocks into a single string,
+                # skipping any block too short to carry a text entry
+                text = "\n".join(
+                    block[BLOCK_TEXT_INDEX]
+                    for block in blocks
+                    if len(block) > BLOCK_TEXT_INDEX
+                )
+            else:
+                text = page.get_text("text", sort=True)
+
+            if page_size_limit and len(text) > page_size_limit:
+                raise ImpossibleParsingError(
+                    f"The text in page {page_num} of {file.page_count} was"
+                    f" {len(text)} chars long, which exceeds the {page_size_limit}"
+                    f" char limit for the PDF at path {path}."
+                )
+            pages[str(page_num)] = text
+            total_length += len(text)
+
+    metadata = ParsedMetadata(
+        parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
+        paperqa_version=pqa_version,
+        total_parsed_text_length=total_length,
+        parse_type="pdf",
+    )
+    return ParsedText(content=pages, metadata=metadata)
diff --git a/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py b/packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from paperqa_pymupdf import parse_pdf_to_pages
+
+# This file lives in packages/paper-qa-pymupdf/tests/, so three levels above its
+# directory is the repository root, which holds the shared tests/stub_data folder
+REPO_ROOT = Path(__file__).parents[3]
+STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"
+
+
+def test_parse_pdf_to_pages() -> None:
+    """Check block parsing of the pasa.pdf stub yields the expected page 1 text."""
+    expected_snippet = (
+        "Abstract\n\nWe introduce PaSa, an advanced Paper Search"
+        "\nagent powered by large language models."
+    )
+    result = parse_pdf_to_pages(STUB_DATA_DIR / "pasa.pdf", use_block_parsing=True)
+    content = result.content
+    assert isinstance(content, dict)
+    assert "1" in content, "Parsed text should contain page 1"
+    assert expected_snippet in content["1"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,14 +24,14 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "PyMuPDF>=1.24.12",  # For pymupdf.set_messages addition
     "aiohttp>=3.10.6",  # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError
     "anyio",
     "fhaviary[llm]>=0.20",  # For Environment.get_id
     "fhlmi>=0.25.4",  # For LLM reasoning
     "html2text",  # TODO: evaluate moving to an opt-in dependency
     "httpx",
     "numpy",
+    "paper-qa-pymupdf",
     "pybtex",
     "pydantic-settings",
     "pydantic~=2.0,>=2.10.1",  # Pin 2.10 for typing breaks
@@ -96,6 +96,7 @@ typing = [
     "types-setuptools",
 ]
 zotero = [
+    "paper-qa-pymupdf",
     "pyzotero",
 ]
 
@@ -155,6 +156,11 @@ error_summary = false
 exclude = [
     "^\\.?venv",  # SEE: https://regex101.com/r/0rp5Br/1
 ]
+# Specifies the paths to use, after trying the paths from MYPYPATH environment variable.
+# Useful if you'd like to keep stubs in your repo, along with the config file.
+# Multiple paths are always separated with a : or , regardless of the platform.
+# User home directory and environment variables will be expanded.
+mypy_path = "$MYPY_CONFIG_FILE_DIR/src,$MYPY_CONFIG_FILE_DIR/packages/paper-qa-pymupdf/src"
 # Specifies the OS platform for the target program, for example darwin or win32
 # (meaning OS X or Windows, respectively). The default is the current platform
 # as revealed by Python’s sys.platform variable.
@@ -596,3 +602,10 @@ extend-exclude = [
     "tests/cassettes/**",
     "tests/stub_data/**",
 ]
+
+[tool.uv.sources]
+paper-qa = {workspace = true}
+paper-qa-pymupdf = {workspace = true}
+
+[tool.uv.workspace]
+members = ["packages/*"]
diff --git a/src/paperqa/contrib/zotero.py b/src/paperqa/contrib/zotero.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import cast
 
+from paperqa_pymupdf import parse_pdf_to_pages
 from pydantic import BaseModel
 
 try:
@@ -15,7 +16,7 @@
         " `pip install paper-qa[zotero]`."
     ) from e
 from paperqa.paths import PAPERQA_DIR
-from paperqa.readers import PDFParserFn, parse_pdf_to_pages
+from paperqa.readers import PDFParserFn
 
 
 class ZoteroPaper(BaseModel):
diff --git a/src/paperqa/readers.py b/src/paperqa/readers.py
@@ -6,7 +6,6 @@
 from pathlib import Path
 from typing import Literal, Protocol, cast, overload, runtime_checkable
 
-import pymupdf
 import tiktoken
 from html2text import __version__ as html2text_version
 from html2text import html2text
@@ -22,15 +21,6 @@
 from paperqa.version import __version__ as pqa_version
 
 
-def setup_pymupdf_python_logging() -> None:
-    """
-    Configure PyMuPDF to use Python logging.
-
-    SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
-    """
-    pymupdf.set_messages(pylogging=True)
-
-
 @runtime_checkable
 class PDFParserFn(Protocol):
     """Protocol for parsing a PDF."""
@@ -40,66 +30,6 @@ def __call__(
     ) -> ParsedText: ...
 
 
-BLOCK_TEXT_INDEX = 4
-
-
-def parse_pdf_to_pages(
-    path: str | os.PathLike,
-    page_size_limit: int | None = None,
-    use_block_parsing: bool = False,
-    **_,
-) -> ParsedText:
-
-    with pymupdf.open(path) as file:
-        pages: dict[str, str] = {}
-        total_length = 0
-
-        for i in range(file.page_count):
-            try:
-                page = file.load_page(i)
-            except pymupdf.mupdf.FzErrorFormat as exc:
-                raise ImpossibleParsingError(
-                    f"Page loading via {pymupdf.__name__} failed on page {i} of"
-                    f" {file.page_count} for the PDF at path {path}, likely this PDF"
-                    " file is corrupt."
-                ) from exc
-
-            if use_block_parsing:
-                # NOTE: this block-based parsing appears to be better, but until
-                # fully validated on 1+ benchmarks, it's considered experimental
-
-                # Extract text blocks from the page
-                # Note: sort=False is important to preserve the order of text blocks
-                # as they appear in the PDF
-                blocks = page.get_text("blocks", sort=False)
-
-                # Concatenate text blocks into a single string
-                text = "\n".join(
-                    block[BLOCK_TEXT_INDEX]
-                    for block in blocks
-                    if len(block) > BLOCK_TEXT_INDEX
-                )
-            else:
-                text = page.get_text("text", sort=True)
-
-            if page_size_limit and len(text) > page_size_limit:
-                raise ImpossibleParsingError(
-                    f"The text in page {i} of {file.page_count} was {len(text)} chars"
-                    f" long, which exceeds the {page_size_limit} char limit for the PDF"
-                    f" at path {path}."
-                )
-            pages[str(i + 1)] = text
-            total_length += len(text)
-
-    metadata = ParsedMetadata(
-        parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
-        paperqa_version=pqa_version,
-        total_parsed_text_length=total_length,
-        parse_type="pdf",
-    )
-    return ParsedText(content=pages, metadata=metadata)
-
-
 def chunk_pdf(
     parsed_text: ParsedText, doc: Doc, chunk_chars: int, overlap: int
 ) -> list[Text]:
diff --git a/src/paperqa/settings.py b/src/paperqa/settings.py
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
diff --git a/uv.lock b/uv.lock