|  | 
|  | 1 | +import os | 
|  | 2 | + | 
|  | 3 | +import pymupdf | 
|  | 4 | +from paperqa.types import ParsedMetadata, ParsedText | 
|  | 5 | +from paperqa.utils import ImpossibleParsingError | 
|  | 6 | +from paperqa.version import __version__ as pqa_version | 
|  | 7 | + | 
|  | 8 | + | 
def setup_pymupdf_python_logging() -> None:
    """
    Route PyMuPDF's messages through Python's logging system.

    SEE: https://pymupdf.readthedocs.io/en/latest/app3.html#diagnostics
    """
    # pylogging=True is the switch that hands PyMuPDF messages to Python logging
    pymupdf.set_messages(pylogging=True)
|  | 16 | + | 
|  | 17 | + | 
# Position of the text payload inside each entry returned by
# page.get_text("blocks"); parse_pdf_to_pages reads block[BLOCK_TEXT_INDEX]
# when block-based parsing is enabled.
BLOCK_TEXT_INDEX = 4
|  | 19 | + | 
|  | 20 | + | 
def parse_pdf_to_pages(
    path: str | os.PathLike,
    page_size_limit: int | None = None,
    use_block_parsing: bool = False,
    **_,
) -> ParsedText:
    """
    Parse a PDF into per-page text using PyMuPDF.

    Args:
        path: Location of the PDF to open.
        page_size_limit: Maximum number of characters allowed in any single
            page's text. A falsy value (None or 0) disables the check.
        use_block_parsing: If True, build each page's text by joining the
            entries of get_text("blocks") with newlines (experimental);
            otherwise use get_text("text", sort=True).
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        ParsedText whose content maps 1-based page numbers (as strings) to
        that page's text, and whose metadata records the PyMuPDF version, the
        paperqa version, and the total number of characters parsed.

    Raises:
        ImpossibleParsingError: If a page fails to load with a MuPDF format
            error, or if a page's text is longer than page_size_limit.
    """
    with pymupdf.open(path) as file:
        pages: dict[str, str] = {}

        for i in range(file.page_count):
            # Human-facing page number: used for both the returned keys and
            # the error messages so the two always agree (first page is 1).
            page_number = i + 1

            try:
                page = file.load_page(i)
            except pymupdf.mupdf.FzErrorFormat as exc:
                raise ImpossibleParsingError(
                    f"Page loading via {pymupdf.__name__} failed on page"
                    f" {page_number} of {file.page_count} for the PDF at path"
                    f" {path}, likely this PDF file is corrupt."
                ) from exc

            if use_block_parsing:
                # NOTE: this block-based parsing appears to be better, but until
                # fully validated on 1+ benchmarks, it's considered experimental

                # sort=False is important to preserve the order of text blocks
                # as they appear in the PDF
                blocks = page.get_text("blocks", sort=False)

                # Concatenate the text of each block, skipping any entry too
                # short to carry a text field
                text = "\n".join(
                    block[BLOCK_TEXT_INDEX]
                    for block in blocks
                    if len(block) > BLOCK_TEXT_INDEX
                )
            else:
                text = page.get_text("text", sort=True)

            # Truthiness check on purpose: None and 0 both mean "no limit"
            if page_size_limit and len(text) > page_size_limit:
                raise ImpossibleParsingError(
                    f"The text in page {page_number} of {file.page_count} was"
                    f" {len(text)} chars long, which exceeds the"
                    f" {page_size_limit} char limit for the PDF at path {path}."
                )
            pages[str(page_number)] = text

    metadata = ParsedMetadata(
        parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
        paperqa_version=pqa_version,
        total_parsed_text_length=sum(len(text) for text in pages.values()),
        parse_type="pdf",
    )
    return ParsedText(content=pages, metadata=metadata)
0 commit comments