Skip to content

Undefined variable in text extraction with version 5.1.0 #2925

@thomasht86

Description

@thomasht86

Our CI pipelines run the latest version of pypdf in some of our example notebooks.
With version 5.1.0, these notebooks fail with the exception shown in the traceback below.

Environment

GitHub runner - ubuntu-latest

Code + PDF

This is a minimal, complete example that shows the issue:

import requests
from pdf2image import convert_from_path
from pypdf import PdfReader


def download_pdf(url):
    """Download the PDF at ``url`` and return its bytes as an in-memory stream.

    Args:
        url: Address fetched with an HTTP GET request.

    Returns:
        A ``BytesIO`` wrapping the response body.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    # Local import so the function does not rely on BytesIO having been
    # imported elsewhere; without it the success path raises NameError.
    from io import BytesIO

    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF: Status code {response.status_code}")
    return BytesIO(response.content)


def get_pdf_images(pdf_url):
    """Fetch a PDF and return its rendered pages together with their text.

    The download is written to ``temp.pdf`` in the current working directory,
    read with ``PdfReader`` to extract text page by page, and rendered with
    ``convert_from_path``.

    Args:
        pdf_url: URL handed to ``download_pdf``.

    Returns:
        A ``(images, page_texts)`` tuple: the result of ``convert_from_path``
        and a list holding the extracted text of every page, in page order.
        Both are asserted to have the same length.
    """
    pdf_bytes = download_pdf(pdf_url).read()

    # pdf2image works on a file path, so the download goes to disk first.
    temp_file = "temp.pdf"
    with open(temp_file, "wb") as out:
        out.write(pdf_bytes)

    reader = PdfReader(temp_file)
    page_texts = [page.extract_text() for page in reader.pages]

    images = convert_from_path(temp_file)
    assert len(images) == len(page_texts)
    return images, page_texts

# PDFs used to reproduce the failure; each entry carries a display title and
# the URL the document is downloaded from.
sample_pdfs = [
    {
        "title": "ConocoPhillips Sustainability Highlights - Nature (24-0976)",
        "url": "https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf",
    },
    {
        "title": "ConocoPhillips Managing Climate Related Risks",
        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf",
    },
    {
        "title": "ConocoPhillips 2023 Sustainability Report",
        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf",
    },
]

# Download and process every sample, storing the rendered page images and the
# extracted page texts back on the same dictionary entry.
for pdf in sample_pdfs:
    page_images, page_texts = get_pdf_images(pdf["url"])
    pdf["images"] = page_images
    pdf["texts"] = page_texts

The affected PDFs are linked in the code above, but the problem also applies to several other PDFs.

Traceback

This is the complete traceback I see:

UnboundLocalError                         Traceback (most recent call last)
Cell In[8], line 2
      1 for pdf in sample_pdfs:
----> 2     page_images, page_texts = get_pdf_images(pdf["url"])
      3     pdf["images"] = page_images
      4     pdf["texts"] = page_texts

Cell In[6], line 24, in get_pdf_images(pdf_url)
     22 for page_number in range(len(reader.pages)):
     23     page = reader.pages[page_number]
---> 24     text = page.extract_text()
     25     page_texts.append(text)
     26 images = convert_from_path("temp.pdf")

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_page.py:2393, in PageObject.extract_text(self, orientations, space_width, visitor_operand_before, visitor_operand_after, visitor_text, extraction_mode, *args, **kwargs)
   2390 if isinstance(orientations, int):
   2391     orientations = (orientations,)
-> 2393 return self._extract_text(
   2394     self,
   2395     self.pdf,
   2396     orientations,
   2397     space_width,
   2398     PG.CONTENTS,
   2399     visitor_operand_before,
   2400     visitor_operand_after,
   2401     visitor_text,
   2402 )

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_page.py:1868, in PageObject._extract_text(self, obj, pdf, orientations, space_width, content_key, visitor_operand_before, visitor_operand_after, visitor_text)
   1866 if "/Font" in resources_dict:
   1867     for f in cast(DictionaryObject, resources_dict["/Font"]):
-> 1868         cmaps[f] = build_char_map(f, space_width, obj)
   1869 cmap: Tuple[
   1870     Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
   1871 ] = (
   (...)
   1875     None,
   1876 )  # (encoding,CMAP,font resource name,dictionary-object of font)
   1877 try:

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:33, in build_char_map(font_name, space_width, obj)
     19 """
     20 Determine information about a font.
     21 
   (...)
     30 
     31 """
     32 ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
---> 33 font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
     34     space_width, ft
     35 )
     36 return font_subtype, font_halfspace, font_encoding, font_map, ft

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:56, in build_char_map_from_dict(space_width, ft)
     42 """
     43 Determine information about a font.
     44 
   (...)
     53 
     54 """
     55 font_type = cast(str, ft["/Subtype"].get_object())
---> 56 encoding, map_dict = get_encoding(ft)
     58 space_key_char = get_actual_str_key(" ", encoding, map_dict)
     59 font_width_map = build_font_width_map(ft, space_width * 2.0)

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:129, in get_encoding(ft)
    125 def get_encoding(
    126     ft: DictionaryObject
    127 ) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    128     encoding = _parse_encoding(ft)
--> 129     map_dict, int_entry = _parse_to_unicode(ft)
    131     # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    132     #   if cmap not empty encoding should be discarded
    133     #   (here transformed into identity for those characters)
    134     # If encoding is a string it is expected to be an identity translation.
    135     if isinstance(encoding, dict):

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:212, in _parse_to_unicode(ft)
    210 if "/ToUnicode" not in ft:
    211     if ft.get("/Subtype", "") == "/Type1":
--> 212         return _type1_alternative(ft, map_dict, int_entry)
    213     else:
    214         return {}, []

File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:530, in _type1_alternative(ft, map_dict, int_entry)
    528                 except ValueError:  # pragma: no cover
    529                     continue
--> 530         map_dict[chr(i)] = v
    531         int_entry.append(i)
    532 return map_dict, int_entry

UnboundLocalError: local variable 'v' referenced before assignment

Metadata

Metadata

Assignees

No one assigned

    Labels

    is-regression: Regression introduced as a side-effect of another change
    workflow-text-extraction: From a user's perspective, text extraction is the affected feature/workflow

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions