Skip to content

AttributeError: 'DictionaryObject' object has no attribute 'get_data' #2995

@neeraj9

Description

@neeraj9

Trying to extract text from a PDF (attached below) raised an AttributeError.

Additional information captured when the error happened (see the traceback below):

s.get_object() = {'/Filter': '/FlateDecode', '/Length': 629}
dir(s.get_object()) = ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__orig_bases__', '__parameters__', '__protocol_attrs__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_clone', '_is_protocol', '_is_runtime_protocol', '_reference_clone', 'clear', 'clone', 'copy', 'fromkeys', 'get', 'get_inherited', 'get_object', 'hash_bin', 'hash_func', 'hash_value', 'hash_value_data', 'indirect_reference', 'items', 'keys', 'pop', 'popitem', 'raw_get', 'read_from_stream', 'replicate', 'setdefault', 'update', 'values', 'write_to_stream', 'xmp_metadata']
type(s.get_object()) = <class 'pypdf.generic._data_structures.DictionaryObject'>

To produce the output above, I added the following print statements to pypdf/generic/_data_structures.py:

if isinstance(stream, ArrayObject):
                data = b""
                for s in stream:
                    print(f"s.get_object() = {s.get_object()}")
                    print(f"dir(s.get_object()) = {dir(s.get_object())}")
                    print(f"type(s.get_object()) = {type(s.get_object())}")
                    data += s.get_object().get_data()
                    if len(data) == 0 or data[-1] != b"\n":
                        data += b"\n"
                super().set_data(bytes(data))
            else:
                stream_data = stream.get_data()
                assert stream_data is not None
                super().set_data(stream_data)

Environment

Which environment were you using when you encountered the problem?

$ python -m platform
Windows-11-10.0.22631-SP0

$ python -c "import pypdf;print(pypdf._debug_versions)"
pypdf==5.1.0, crypt_provider=('local_crypt_fallback', '0.0.0'), PIL=11.0.0

Code + PDF

This is a minimal, complete example that shows the issue:

for page_num in range_of_pages:
        page = pdf_reader.pages[page_num]
        page_text = page.extract_text()
        page_text = page_text.strip()
        if not page_text:
            page_num_without_text.append(page_num + 1)
        page_texts.append(page_text)

Share here the PDF file(s) that cause the issue. The smaller they are, the
better. Let us know if we may add them to our tests!

Traceback

This is the complete traceback I see:

  File "venv\Lib\site-packages\pypdf\_page.py", line 2397, in extract_text
    return self._extract_text(
           ^^^^^^^^^^^^^^^^^^^
  File "venv\Lib\site-packages\pypdf\_page.py", line 1882, in _extract_text
    content = ContentStream(content, pdf, "bytes")
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "venv\Lib\site-packages\pypdf\generic\_data_structures.py", line 1181, in __init__
    data += s.get_object().get_data()
            ^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DictionaryObject' object has no attribute 'get_data'

6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf

Workaround to get past issue

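This is NOT a fix, only a temporary workaround to get past the text extraction error. It skips the TJ handling when the operand list is empty, and it ignores entries of the content array whose object has no get_data() method (so their content is silently dropped).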

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 5cbbc49..e5e9ee0 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2108,15 +2108,16 @@ class PageObject(DictionaryObject):
             elif operator == b"TJ":
                 # The space width may be smaller than the font width, so the width should be 95%.
                 _confirm_space_width = _space_width * 0.95
-                for op in operands[0]:
-                    if isinstance(op, (str, bytes)):
-                        process_operation(b"Tj", [op])
-                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _confirm_space_width)
-                        and (len(text) > 0)
-                        and (text[-1] != " ")
-                    ):
-                        process_operation(b"Tj", [" "])
+                if len(operands) > 0:
+                    for op in operands[0]:
+                        if isinstance(op, (str, bytes)):
+                            process_operation(b"Tj", [op])
+                        if isinstance(op, (int, float, NumberObject, FloatObject)) and (
+                            (abs(float(op)) >= _confirm_space_width)
+                            and (len(text) > 0)
+                            and (text[-1] != " ")
+                        ):
+                            process_operation(b"Tj", [" "])
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index e525b87..08a2d84 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -1178,7 +1178,10 @@ class ContentStream(DecodedStreamObject):
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
-                    data += s.get_object().get_data()
+                    try:
+                        data += s.get_object().get_data()
+                    except AttributeError:
+                        pass
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
                 super().set_data(bytes(data))

Metadata

Metadata

Assignees

No one assigned

    Labels

    is-uncaught-exception: Use this label only for issues caused by broken PDF documents that cannot be recovered.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions