-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Closed
Labels
is-uncaught-exception: Use this label only for issues caused by broken PDF documents that cannot be recovered.
Description
Extracting text from one of my PDF files fails with an uncaught AttributeError.
Additional information captured when the error occurred (see the traceback below):
s.get_object() = {'/Filter': '/FlateDecode', '/Length': 629}
dir(s.get_object()) = ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__orig_bases__', '__parameters__', '__protocol_attrs__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_clone', '_is_protocol', '_is_runtime_protocol', '_reference_clone', 'clear', 'clone', 'copy', 'fromkeys', 'get', 'get_inherited', 'get_object', 'hash_bin', 'hash_func', 'hash_value', 'hash_value_data', 'indirect_reference', 'items', 'keys', 'pop', 'popitem', 'raw_get', 'read_from_stream', 'replicate', 'setdefault', 'update', 'values', 'write_to_stream', 'xmp_metadata']
type(s.get_object()) = <class 'pypdf.generic._data_structures.DictionaryObject'>
To capture the output above, I added the following debug traces to pypdf/generic/_data_structures.py:
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
print(f"s.get_object() = {s.get_object()}")
print(f"dir(s.get_object()) = {dir(s.get_object())}")
print(f"type(s.get_object()) = {type(s.get_object())}")
data += s.get_object().get_data()
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
super().set_data(bytes(data))
else:
stream_data = stream.get_data()
assert stream_data is not None
super().set_data(stream_data)
Environment
Which environment were you using when you encountered the problem?
$ python -m platform
Windows-11-10.0.22631-SP0
$ python -c "import pypdf;print(pypdf._debug_versions)"
pypdf==5.1.0, crypt_provider=('local_crypt_fallback', '0.0.0'), PIL=11.0.0
Code + PDF
This is a minimal, complete example that shows the issue:
for page_num in range_of_pages:
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
page_text = page_text.strip()
if not page_text:
page_num_without_text.append(page_num + 1)
page_texts.append(page_text)
Share here the PDF file(s) that cause the issue. The smaller they are, the
better. Let us know if we may add them to our tests!
Traceback
This is the complete traceback I see:
File "venv\Lib\site-packages\pypdf\_page.py", line 2397, in extract_text
return self._extract_text(
^^^^^^^^^^^^^^^^^^^
File "venv\Lib\site-packages\pypdf\_page.py", line 1882, in _extract_text
content = ContentStream(content, pdf, "bytes")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "venv\Lib\site-packages\pypdf\generic\_data_structures.py", line 1181, in __init__
data += s.get_object().get_data()
^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DictionaryObject' object has no attribute 'get_data'
6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf
Workaround to get past issue
This is NOT a FIX, but a temporary workaround to get past the text extraction error.
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 5cbbc49..e5e9ee0 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2108,15 +2108,16 @@ class PageObject(DictionaryObject):
elif operator == b"TJ":
# The space width may be smaller than the font width, so the width should be 95%.
_confirm_space_width = _space_width * 0.95
- for op in operands[0]:
- if isinstance(op, (str, bytes)):
- process_operation(b"Tj", [op])
- if isinstance(op, (int, float, NumberObject, FloatObject)) and (
- (abs(float(op)) >= _confirm_space_width)
- and (len(text) > 0)
- and (text[-1] != " ")
- ):
- process_operation(b"Tj", [" "])
+ if len(operands) > 0:
+ for op in operands[0]:
+ if isinstance(op, (str, bytes)):
+ process_operation(b"Tj", [op])
+ if isinstance(op, (int, float, NumberObject, FloatObject)) and (
+ (abs(float(op)) >= _confirm_space_width)
+ and (len(text) > 0)
+ and (text[-1] != " ")
+ ):
+ process_operation(b"Tj", [" "])
elif operator == b"Do":
output += text
if visitor_text is not None:
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index e525b87..08a2d84 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -1178,7 +1178,10 @@ class ContentStream(DecodedStreamObject):
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
- data += s.get_object().get_data()
+ try:
+ data += s.get_object().get_data()
+ except AttributeError:
+ pass
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
super().set_data(bytes(data))
Metadata
Metadata
Assignees
Labels
is-uncaught-exception: Use this label only for issues caused by broken PDF documents that cannot be recovered.