-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Closed
Labels
is-regression: Regression introduced as a side-effect of another change
workflow-text-extraction: From a user's perspective, text extraction is the affected feature/workflow
Description
Our CI pipelines run the latest version of pypdf
in some of our example notebooks.
These notebooks fail with the following exception with pypdf version 5.1.0.
Environment
Github runner - ubuntu-latest
Code + PDF
This is a minimal, complete example that shows the issue:
import os
import tempfile
from io import BytesIO

import requests
from pdf2image import convert_from_path
from pypdf import PdfReader
def download_pdf(url, timeout=30):
    """Download the PDF at ``url`` and return its bytes as an in-memory stream.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled server would hang the caller (for
            example a CI job) indefinitely.

    Returns:
        A ``BytesIO`` wrapping the response body.

    Raises:
        Exception: If the server answers with a status code other than 200.
            Errors raised by ``requests.get`` itself (connection failures,
            timeouts) propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF: Status code {response.status_code}")
    return BytesIO(response.content)
def get_pdf_images(pdf_url):
    """Download a PDF and return its rendered pages and their extracted text.

    Args:
        pdf_url: Address of the PDF, passed on to ``download_pdf``.

    Returns:
        A tuple ``(images, page_texts)``: ``images`` is the result of
        ``convert_from_path`` for the PDF, and ``page_texts`` is a list with
        the ``extract_text()`` result of every page in ``reader.pages``.

    Raises:
        AssertionError: If the number of rendered images differs from the
            number of extracted page texts.
    """
    # Download the PDF
    pdf_file = download_pdf(pdf_url)

    # pdf2image requires a file path, so the PDF is saved to disk. A private
    # temporary directory is used so the file is removed afterwards and
    # parallel runs cannot overwrite each other's "temp.pdf".
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file = os.path.join(temp_dir, "temp.pdf")
        with open(temp_file, "wb") as f:
            f.write(pdf_file.read())

        reader = PdfReader(temp_file)
        page_texts = [page.extract_text() for page in reader.pages]
        images = convert_from_path(temp_file)

    assert len(images) == len(page_texts)
    return (images, page_texts)
# Publicly hosted PDFs used to reproduce the failure; each entry is later
# extended in place with its rendered page images and extracted page texts.
sample_pdfs = [
    {
        "title": "ConocoPhillips Sustainability Highlights - Nature (24-0976)",
        "url": "https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf",
    },
    {
        "title": "ConocoPhillips Managing Climate Related Risks",
        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf",
    },
    {
        "title": "ConocoPhillips 2023 Sustainability Report",
        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf",
    },
]

# Process every sample document and attach the results to its entry.
for pdf in sample_pdfs:
    page_images, page_texts = get_pdf_images(pdf["url"])
    pdf.update(images=page_images, texts=page_texts)
The affected PDFs are linked above, but the problem also applies to several others.
Traceback
This is the complete traceback I see:
UnboundLocalError Traceback (most recent call last)
Cell In[8], line 2
1 for pdf in sample_pdfs:
----> 2 page_images, page_texts = get_pdf_images(pdf["url"])
3 pdf["images"] = page_images
4 pdf["texts"] = page_texts
Cell In[6], line 24, in get_pdf_images(pdf_url)
22 for page_number in range(len(reader.pages)):
23 page = reader.pages[page_number]
---> 24 text = page.extract_text()
25 page_texts.append(text)
26 images = convert_from_path("temp.pdf")
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_page.py:2393, in PageObject.extract_text(self, orientations, space_width, visitor_operand_before, visitor_operand_after, visitor_text, extraction_mode, *args, **kwargs)
2390 if isinstance(orientations, int):
2391 orientations = (orientations,)
-> 2393 return self._extract_text(
2394 self,
2395 self.pdf,
2396 orientations,
2397 space_width,
2398 PG.CONTENTS,
2399 visitor_operand_before,
2400 visitor_operand_after,
2401 visitor_text,
2402 )
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_page.py:1868, in PageObject._extract_text(self, obj, pdf, orientations, space_width, content_key, visitor_operand_before, visitor_operand_after, visitor_text)
1866 if "/Font" in resources_dict:
1867 for f in cast(DictionaryObject, resources_dict["/Font"]):
-> 1868 cmaps[f] = build_char_map(f, space_width, obj)
1869 cmap: Tuple[
1870 Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1871 ] = (
(...)
1875 None,
1876 ) # (encoding,CMAP,font resource name,dictionary-object of font)
1877 try:
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:33, in build_char_map(font_name, space_width, obj)
19 """
20 Determine information about a font.
21
(...)
30
31 """
32 ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
---> 33 font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
34 space_width, ft
35 )
36 return font_subtype, font_halfspace, font_encoding, font_map, ft
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:56, in build_char_map_from_dict(space_width, ft)
42 """
43 Determine information about a font.
44
(...)
53
54 """
55 font_type = cast(str, ft["/Subtype"].get_object())
---> 56 encoding, map_dict = get_encoding(ft)
58 space_key_char = get_actual_str_key(" ", encoding, map_dict)
59 font_width_map = build_font_width_map(ft, space_width * 2.0)
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:129, in get_encoding(ft)
125 def get_encoding(
126 ft: DictionaryObject
127 ) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
128 encoding = _parse_encoding(ft)
--> 129 map_dict, int_entry = _parse_to_unicode(ft)
131 # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
132 # if cmap not empty encoding should be discarded
133 # (here transformed into identity for those characters)
134 # If encoding is a string it is expected to be an identity translation.
135 if isinstance(encoding, dict):
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:212, in _parse_to_unicode(ft)
210 if "/ToUnicode" not in ft:
211 if ft.get("/Subtype", "") == "/Type1":
--> 212 return _type1_alternative(ft, map_dict, int_entry)
213 else:
214 return {}, []
File ~/work/pyvespa/pyvespa/.venv/lib/python3.10/site-packages/pypdf/_cmap.py:530, in _type1_alternative(ft, map_dict, int_entry)
528 except ValueError: # pragma: no cover
529 continue
--> 530 map_dict[chr(i)] = v
531 int_entry.append(i)
532 return map_dict, int_entry
UnboundLocalError: local variable 'v' referenced before assignment
amotl
Metadata
Metadata
Assignees
Labels
is-regression: Regression introduced as a side-effect of another change
workflow-text-extraction: From a user's perspective, text extraction is the affected feature/workflow