AttributeError: 'DictionaryObject' object has no attribute 'get_data'

Extracting text from one of my PDF files (attached below) raises the `AttributeError` shown above.

Additional debug output captured when the error occurred (see the traceback below):

```
s.get_object() = {'/Filter': '/FlateDecode', '/Length': 629}
dir(s.get_object()) = ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__orig_bases__', '__parameters__', '__protocol_attrs__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_clone', '_is_protocol', '_is_runtime_protocol', '_reference_clone', 'clear', 'clone', 'copy', 'fromkeys', 'get', 'get_inherited', 'get_object', 'hash_bin', 'hash_func', 'hash_value', 'hash_value_data', 'indirect_reference', 'items', 'keys', 'pop', 'popitem', 'raw_get', 'read_from_stream', 'replicate', 'setdefault', 'update', 'values', 'write_to_stream', 'xmp_metadata']
type(s.get_object()) = <class 'pypdf.generic._data_structures.DictionaryObject'>
```

The output above comes from debug `print` calls that I added to `pypdf/generic/_data_structures.py`:

```python
if isinstance(stream, ArrayObject):
                data = b""
                for s in stream:
                    print(f"s.get_object() = {s.get_object()}")
                    print(f"dir(s.get_object()) = {dir(s.get_object())}")
                    print(f"type(s.get_object()) = {type(s.get_object())}")
                    data += s.get_object().get_data()
                    if len(data) == 0 or data[-1] != b"\n":
                        data += b"\n"
                super().set_data(bytes(data))
            else:
                stream_data = stream.get_data()
                assert stream_data is not None
                super().set_data(stream_data)
```

## Environment

Which environment were you using when you encountered the problem?

```bash
$ python -m platform
Windows-11-10.0.22631-SP0

$ python -c "import pypdf;print(pypdf._debug_versions)"
pypdf==5.1.0, crypt_provider=('local_crypt_fallback', '0.0.0'), PIL=11.0.0
```

## Code + PDF

This is a minimal, complete example that shows the issue:

```python
for page_num in range_of_pages:
        page = pdf_reader.pages[page_num]
        page_text = page.extract_text()
        page_text = page_text.strip()
        if not page_text:
            page_num_without_text.append(page_num + 1)
        page_texts.append(page_text)
```

Share here the PDF file(s) that cause the issue. The smaller they are, the
better. Let us know if we may add them to our tests!

## Traceback

This is the complete traceback I see:

```
  File "venv\Lib\site-packages\pypdf\_page.py", line 2397, in extract_text
    return self._extract_text(
           ^^^^^^^^^^^^^^^^^^^
  File "venv\Lib\site-packages\pypdf\_page.py", line 1882, in _extract_text
    content = ContentStream(content, pdf, "bytes")
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "venv\Lib\site-packages\pypdf\generic\_data_structures.py", line 1181, in __init__
    data += s.get_object().get_data()
            ^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DictionaryObject' object has no attribute 'get_data'
```


[6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf](https://github.com/user-attachments/files/18049322/6fa5fd46-5f98-4a67-800d-5e2362b0164f.pdf)


## Workaround to get past issue

> This is NOT a FIX, but a temporary workaround to get past the text extraction error.

```diff
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 5cbbc49..e5e9ee0 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2108,15 +2108,16 @@ class PageObject(DictionaryObject):
             elif operator == b"TJ":
                 # The space width may be smaller than the font width, so the width should be 95%.
                 _confirm_space_width = _space_width * 0.95
-                for op in operands[0]:
-                    if isinstance(op, (str, bytes)):
-                        process_operation(b"Tj", [op])
-                    if isinstance(op, (int, float, NumberObject, FloatObject)) and (
-                        (abs(float(op)) >= _confirm_space_width)
-                        and (len(text) > 0)
-                        and (text[-1] != " ")
-                    ):
-                        process_operation(b"Tj", [" "])
+                if len(operands) > 0:
+                    for op in operands[0]:
+                        if isinstance(op, (str, bytes)):
+                            process_operation(b"Tj", [op])
+                        if isinstance(op, (int, float, NumberObject, FloatObject)) and (
+                            (abs(float(op)) >= _confirm_space_width)
+                            and (len(text) > 0)
+                            and (text[-1] != " ")
+                        ):
+                            process_operation(b"Tj", [" "])
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index e525b87..08a2d84 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -1178,7 +1178,10 @@ class ContentStream(DecodedStreamObject):
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
-                    data += s.get_object().get_data()
+                    try:
+                        data += s.get_object().get_data()
+                    except AttributeError:
+                        pass
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
                 super().set_data(bytes(data))
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AttributeError: 'DictionaryObject' object has no attribute 'get_data' #2995

Environment

Code + PDF

Traceback

Workaround to get past issue

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

AttributeError: 'DictionaryObject' object has no attribute 'get_data' #2995

Description

Environment

Code + PDF

Traceback

Workaround to get past issue

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions