Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: cope with loops in Fields tree #2656

Merged
merged 7 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 40 additions & 32 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,17 +492,19 @@
tree: Optional[TreeObject] = None,
retval: Optional[Dict[Any, Any]] = None,
fileobj: Optional[Any] = None,
stack: Optional[List[PdfObject]] = None,
) -> Optional[Dict[str, Any]]:
"""
Extract field data if this PDF contains interactive form fields.

The *tree* and *retval* parameters are for recursive use.
The *tree*, *retval*, and *stack* parameters are for recursive use.

Args:
tree:
retval:
tree: current object parsed
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
retval: in-progress dictionary of fields
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
stack: list of objects already parsed
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved

Returns:
A dictionary where each key is a field name, and each
Expand All @@ -515,26 +517,25 @@
if retval is None:
retval = {}
catalog = self.root_object
stack = []
# get the AcroForm tree
if CD.ACRO_FORM in catalog:
tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM])
else:
return None
if tree is None:
return retval
self._check_kids(tree, retval, fileobj)
for attr in field_attributes:
if attr in tree:
# Tree is a field
self._build_field(tree, retval, fileobj, field_attributes)
break

assert stack is not None
if "/Fields" in tree:
fields = cast(ArrayObject, tree["/Fields"])
for f in fields:
field = f.get_object()
self._build_field(field, retval, fileobj, field_attributes)

self._build_field(field, retval, fileobj, field_attributes, stack)
else:
self._check_kids(tree, retval, fileobj, stack)
if any(attr in tree for attr in field_attributes):
# Tree is a field
self._build_field(tree, retval, fileobj, field_attributes, stack)
return retval

def _get_qualified_field_name(self, parent: DictionaryObject) -> str:
Expand All @@ -557,25 +558,12 @@
retval: Dict[Any, Any],
fileobj: Any,
field_attributes: Any,
stack: List[PdfObject],
) -> None:
self._check_kids(field, retval, fileobj)
try:
key = cast(str, field["/TM"])
except KeyError:
try:
if "/Parent" in field:
key = (
self._get_qualified_field_name(
cast(DictionaryObject, field["/Parent"])
)
+ "."
)
else:
key = ""
key += cast(str, field["/T"])
except KeyError:
# Ignore no-name field for now
return
self._check_kids(field, retval, fileobj, stack)
if all(attr not in field for attr in ("/T", "/TM")):
return
key = self._get_qualified_field_name(field)
if fileobj:
self._write_field(fileobj, field, field_attributes)
fileobj.write("\n")
Expand Down Expand Up @@ -606,12 +594,32 @@
del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")]

def _check_kids(
self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any
self,
tree: Union[TreeObject, DictionaryObject],
retval: Any,
fileobj: Any,
stack: List[PdfObject],
) -> None:
if tree in stack:
logger_warning(
f"{self._get_qualified_field_name(tree)} already parsed", __name__
)
return
stack.append(tree)
if PA.KIDS in tree:
# recurse down the tree
for kid in tree[PA.KIDS]: # type: ignore
self.get_fields(kid.get_object(), retval, fileobj)
kid = kid.get_object()
if tree.indirect_reference != kid.get("/Parent", None):
logger_warning(

Check warning on line 614 in pypdf/_doc_common.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_doc_common.py#L614

Added line #L614 was not covered by tests
(
f'"/Parent" of {self._get_qualified_field_name(kid)} '
f"different from {self._get_qualified_field_name(tree)}"
),
__name__,
)
continue

Check warning on line 621 in pypdf/_doc_common.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_doc_common.py#L621

Added line #L621 was not covered by tests
self.get_fields(kid, retval, fileobj, stack)

def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None:
field_attributes_tuple = FA.attributes()
Expand Down
19 changes: 19 additions & 0 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1530,3 +1530,22 @@ def test_damaged_pdf():
assert (
exc.value.args[0] == "Expected object ID (21 0) does not match actual (-1 -1)."
)


@pytest.mark.enable_socket()
def test_looping_form():
    """Regression test for issue 2643: get_fields() on a form with a looping tree."""
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/15306053/inheritance.pdf",
        name="iss2643.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes), strict=False)
    fields = reader.get_fields()

    expected_names = (
        "Text10",
        "Text10.0.0.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1",
        "amt1.0",
        "amt1.1",
        "DSS#3pg3#0hgu7",
    )
    # Each expected field name must be present in the extracted fields.
    for field_name in expected_names:
        assert field_name in fields
Loading