Skip to content

Handle duplicate widget names when joining PDFs #4347

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4562,6 +4562,7 @@ def insert_pdf(
links=1,
annots=1,
widgets=1,
join_duplicates=0,
show_progress=0,
final=1,
_gmap=None,
Expand All @@ -4577,6 +4578,7 @@ def insert_pdf(
links: (int/bool) whether to also copy links.
annots: (int/bool) whether to also copy annotations.
widgets: (int/bool) whether to also copy form fields.
join_duplicates: (int/bool) join or rename duplicate widget names.
show_progress: (int) progress message interval, 0 is no messages.
final: (bool) indicates last insertion from this source PDF.
_gmap: internal use only
Expand Down Expand Up @@ -4663,7 +4665,7 @@ def insert_pdf(
#log( 'insert_pdf(): calling self._do_links()')
self._do_links(docsrc, from_page=fp, to_page=tp, start_at=sa)
if widgets:
self._do_widgets(docsrc, _gmap, from_page=fp, to_page=tp, start_at=sa)
self._do_widgets(docsrc, _gmap, from_page=fp, to_page=tp, start_at=sa, join_duplicates=join_duplicates)
if final == 1:
self.Graftmaps[isrt] = None
#log( 'insert_pdf(): returning')
Expand Down
311 changes: 254 additions & 57 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1683,15 +1683,185 @@ def do_widgets(
from_page: int = -1,
to_page: int = -1,
start_at: int = -1,
join_duplicates=0,
) -> None:
"""Insert widgets contained in copied page range into destination PDF.
"""Insert widgets of copied page range into target PDF.

Parameter values **must** equal those of method insert_pdf(). Method
insert_pdf() which must have been previously executed.
Parameter values **must** equal those of method insert_pdf() which
must have been previously executed.
"""
if not src.is_form_pdf: # nothing to do: source PDF has no fields
return

def clean_kid_parents(acro_fields):
    """Point the "Parent" entry of every direct kid at its owning field.

    For each entry of ``acro_fields``, every item of that entry's "Kids"
    array gets its "Parent" key (re)written to reference the entry.
    Only one level of kids is visited.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    parent_key = pymupdf.PDF_NAME("Parent")
    field_count = acro_fields.pdf_array_len()
    for field_no in range(field_count):
        owner = acro_fields.pdf_array_get(field_no)
        children = owner.pdf_dict_get(kids_key)
        child_count = children.pdf_array_len()
        for child_no in range(child_count):
            children.pdf_array_get(child_no).pdf_dict_put(parent_key, owner)

def join_widgets(pdf, acro_fields, xref1, xref2, name):
    """Combine two equally named widgets under one common parent field.

    Invoked once for each pair of widgets that share a field name.

    Args:
        pdf: target MuPDF document
        acro_fields: the Root/AcroForm/Fields array
        xref1, xref2: xrefs of the two widgets having the same name
        name: (str) the shared field name

    Result:
        A parent field references both widgets. It is either one of the
        two objects (if it already owns a "Kids" array) or a newly
        created object.
    """

    def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
        """Move widget xref2, or its kids, into the "Kids" of widget xref1.

        Args:
            xref1, kids1: target widget and its "Kids" array.
            xref2, kids2: source widget and its "Kids" entry (not an
                array if the source has no kids).
        """
        target_ref = mupdf.pdf_new_indirect(pdf, xref1, 0)
        source_ref = mupdf.pdf_new_indirect(pdf, xref2, 0)

        # the source stops being an entry of the "Fields" array
        acro_fields.pdf_array_delete(acro_fields.pdf_array_find(source_ref))

        if kids2.pdf_is_array():
            # source owns kids: hand each of them over to the target
            for pos in range(kids2.pdf_array_len()):
                child = kids2.pdf_array_get(pos)
                child.pdf_dict_put(pymupdf.PDF_NAME("Parent"), target_ref)
                kids1.pdf_array_push(
                    mupdf.pdf_new_indirect(pdf, child.pdf_to_num(), 0)
                )
            return

        # source has no kids: it becomes a kid of the target itself, so
        # it gives up its own name and receives the target as "Parent"
        source = mupdf.pdf_load_object(pdf, xref2)
        source.pdf_dict_del(pymupdf.PDF_NAME("T"))
        source.pdf_dict_put(pymupdf.PDF_NAME("Parent"), target_ref)
        kids1.pdf_array_push(source_ref)

    def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
        """Create a new "Parent" object for two widgets with the same name.

        Args:
            xref1, w1: first widget
            xref2, w2: second widget
            name: field name

        Result:
            Neither widget has "Kids". A new object carrying the name and
            a "Kids" array with both widgets is added to the document and
            to AcroForm/Fields; the two widgets are removed from
            AcroForm/Fields.
        """
        # build the parent: name plus an (initially empty) "Kids" array
        parent_dict = mupdf.pdf_new_dict(pdf, 5)
        parent_dict.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
        parent_kids = parent_dict.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
        parent_obj = mupdf.pdf_add_object(pdf, parent_dict)
        parent_ref = mupdf.pdf_new_indirect(pdf, parent_obj.pdf_to_num(), 0)

        # move field type and additional actions from the first widget
        # to the parent
        for key in ("FT", "AA"):
            value = w1.pdf_dict_get(pymupdf.PDF_NAME(key))
            w1.pdf_dict_del(pymupdf.PDF_NAME(key))
            parent_obj.pdf_dict_put(pymupdf.PDF_NAME(key), value)

        # the widgets lose their name and point to the new parent
        for widget in (w1, w2):
            widget.pdf_dict_del(pymupdf.PDF_NAME("T"))
            widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent_ref)

        widget_refs = [
            mupdf.pdf_new_indirect(pdf, xref1, 0),
            mupdf.pdf_new_indirect(pdf, xref2, 0),
        ]

        # register both widgets as kids of the parent ...
        for ref in widget_refs:
            parent_kids.pdf_array_push(ref)

        # ... and replace them in "AcroForm/Fields" by the parent
        for ref in widget_refs:
            acro_fields.pdf_array_delete(acro_fields.pdf_array_find(ref))
        acro_fields.pdf_array_push(parent_ref)

    first = mupdf.pdf_load_object(pdf, xref1)
    second = mupdf.pdf_load_object(pdf, xref2)
    first_kids = first.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
    second_kids = second.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

    # prefer a widget that already owns a "Kids" array as the join target
    if first_kids.pdf_is_array():
        re_target(pdf, acro_fields, xref1, first_kids, xref2, second_kids)
        return
    if second_kids.pdf_is_array():
        re_target(pdf, acro_fields, xref2, second_kids, xref1, first_kids)  # pylint: disable=arguments-out-of-order
        return
    new_target(pdf, acro_fields, xref1, first, xref2, second, name)

def get_kids(parent, kids_list):
    """Collect the xrefs of all leaf kids below a field.

    Args:
        parent: field object whose "Kids" array is traversed.
        kids_list: list receiving the xrefs. Call with an empty list.

    Returns:
        ``kids_list``, extended in place by the xref of every descendant
        that has no "Kids" array of its own. It is returned unchanged if
        ``parent`` has no "Kids" array.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    kids = mupdf.pdf_dict_get(parent, kids_key)
    if not kids.pdf_is_array():
        return kids_list
    for i in range(kids.pdf_array_len()):
        kid = kids.pdf_array_get(i)
        # "Kids" is an array, so an intermediate node must be detected with
        # an array test. A dict test never matches and would record the
        # intermediate node itself as a leaf instead of descending into it.
        if mupdf.pdf_is_array(mupdf.pdf_dict_get(kid, kids_key)):
            kids_list = get_kids(kid, kids_list)
        else:
            kids_list.append(kid.pdf_to_num())
    return kids_list

def kids_xrefs(widget):
    """Return the xref of a widget's "Parent" and that parent's leaf kids.

    Returns:
        Tuple ``(parent_xref, kids)``. ``kids`` is the xref list delivered
        by get_kids() for the parent. If the widget has no "Parent"
        (xref 0), ``kids`` is an empty list.
    """
    parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
    parent_xref = parent.pdf_to_num()
    if not parent_xref:  # widget has no parent
        return parent_xref, []
    return parent_xref, get_kids(parent, [])

def deduplicate_names(pdf, acro_fields, join_duplicates=False):
    """Resolve field names occurring more than once in "AcroForm/Fields".

    Args:
        pdf: target MuPDF document
        acro_fields: the Root/AcroForm/Fields array
        join_duplicates: if true, two fields sharing a name are combined
            via join_widgets(); otherwise the second field is renamed by
            appending " [xref]" to its name.

    Only the first two fields of any name are considered. Afterwards the
    "Parent" pointers of all kids are refreshed via clean_kid_parents().
    """
    # map each field name to the xrefs of the "Fields" entries using it
    xrefs_by_name = {}
    for pos in range(mupdf.pdf_array_len(acro_fields)):
        field = mupdf.pdf_array_get(acro_fields, pos)
        field_name = mupdf.pdf_dict_get_text_string(field, pymupdf.PDF_NAME("T"))
        xrefs_by_name.setdefault(field_name, []).append(field.pdf_to_num())

    for name, xrefs in xrefs_by_name.items():
        if len(xrefs) >= 2:
            first, second = xrefs[:2]  # only exactly 2 should occur!
            if not join_duplicates:
                # make the name unique by tagging it with the xref
                unique_name = f"{name} [{second}]"
                duplicate = mupdf.pdf_load_object(pdf, second)
                duplicate.pdf_dict_put_text_string(
                    pymupdf.PDF_NAME("T"), unique_name
                )
            else:
                # combine the fields with equal names
                join_widgets(pdf, acro_fields, first, second, name)

    clean_kid_parents(acro_fields)

def get_acroform(doc):
"""Retrieve the AcroForm dictionary form a PDF."""
pdf = mupdf.pdf_document_from_fz_document(doc)
Expand All @@ -1702,56 +1872,79 @@ def get_acroform(doc):
srcpdf = mupdf.pdf_document_from_fz_document(src)

if tar.is_form_pdf:
# target is a Form PDF, so use its AcroForm to include source fields
# target is a Form PDF, so use it to include source fields
acro = get_acroform(tar)
# Important arrays of indirect objects
tar_fields = mupdf.pdf_dict_get(acro, pymupdf.PDF_NAME("Fields"))
tar_co = mupdf.pdf_dict_get(acro, pymupdf.PDF_NAME("CO"))
if not mupdf.pdf_is_array(tar_co):
tar_co = mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("CO"), 5)
# Important arrays in AcroForm
acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
if not tar_co.pdf_is_array():
tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
else:
# target is not a Form PDF, so copy over source AcroForm
acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy

# Clear "Fields" and "CO" arrays: will be populated by page fields.
# This is required to avoid copying unneeded objects.
mupdf.pdf_dict_del(acro, pymupdf.PDF_NAME("Fields"))
mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("Fields"), 5)
mupdf.pdf_dict_del(acro, pymupdf.PDF_NAME("CO"))
mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("CO"), 5)
acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

# Enrich AcroForm for copying to target
acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

# Insert AcroForm into target PDF
acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
tar_fields = mupdf.pdf_dict_get(acro_tar, pymupdf.PDF_NAME("Fields"))
tar_co = mupdf.pdf_dict_get(acro_tar, pymupdf.PDF_NAME("CO"))
acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

# get its xref and insert it into target catalog
tar_xref = mupdf.pdf_to_num(acro_tar)
tar_xref = acro_tar.pdf_to_num()
acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
mupdf.pdf_dict_put(root, pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)
root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

if from_page <= to_page:
src_range = range(from_page, to_page + 1)
else:
src_range = range(from_page, to_page - 1, -1)

for i in range(len(src_range)):
# read first page that was copied over
tar_page = tar[start_at + i]

# convert it to a formal PDF page
tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)
parents = {} # information about widget parents

# extract its annotations array
tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
if not mupdf.pdf_is_array(tar_annots):
tar_annots = mupdf.pdf_dict_put_array(
tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
)
# remove "P" owning page reference from all widgets of all source pages
for i in src_range:
src_page = src[src_range[i]]
for xref in [
xref
for xref, wtype, _ in src_page.annot_xrefs()
if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member
]:
w_obj = mupdf.pdf_load_object(srcpdf, xref)
w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

# get the widget's parent structure
parent_xref, old_kids = kids_xrefs(w_obj)
if parent_xref:
parents[parent_xref] = {
"new_xref": 0,
"old_kids": old_kids,
"new_kids": [],
}
# Copy over Parent widgets first - they are not page-dependent
for xref in parents.keys(): # pylint: disable=consider-using-dict-items
parent = mupdf.pdf_load_object(srcpdf, xref)
parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
kids_xrefs_new = get_kids(parent_tar, [])
parent_xref_new = parent_tar.pdf_to_num()
parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
acro_fields.pdf_array_push(parent_ind)
parents[xref]["new_xref"] = parent_xref_new
parents[xref]["new_kids"] = kids_xrefs_new

for i in src_range:
# read first copied over page in target
tar_page = tar[start_at + i]

# read the original page in the source PDF
src_page = src[src_range[i]]
Expand All @@ -1762,44 +1955,48 @@ def get_acroform(doc):
for xref, wtype, _ in src_page.annot_xrefs()
if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member
]
if not w_xrefs: # no widgets on this source page
continue

# Remove page references from widgets to prevent duplicate copies
# of the page in the target.
for xref in w_xrefs:
w_obj = mupdf.pdf_load_object(srcpdf, xref)
mupdf.pdf_dict_del(w_obj, pymupdf.PDF_NAME("P"))
# convert to formal PDF page
tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

# extract annotations array
tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
if not mupdf.pdf_is_array(tar_annots):
tar_annots = mupdf.pdf_dict_put_array(
tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
)

for xref in w_xrefs:
w_obj = mupdf.pdf_load_object(srcpdf, xref)

# check if field is a member of inter-field validations
temp = mupdf.pdf_dict_getp(w_obj, "AA/C")
if mupdf.pdf_is_dict(temp):
is_aac = True
else:
is_aac = False

# recursively complete the widget object with all referenced objects
w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)

# add the completed widget object to the target PDF
w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)

# extract its generated target xref number
tar_xref = mupdf.pdf_to_num(w_obj_tar)
# check if field takes part in inter-field validations
is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

# create an indirect object from it
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)

# insert this xref reference into the page,
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
# check if parent of widget already in target
parent_xref = mupdf.pdf_to_num(
w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
)
if parent_xref == 0: # parent not in target yet
w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
tar_xref = w_obj_tar.pdf_to_num()
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
else:
parent = parents[parent_xref]
idx = parent["old_kids"].index(xref) # search for xref in parent
tar_xref = parent["new_kids"][idx]
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

# and also into "AcroForm/Fields",
mupdf.pdf_array_push(tar_fields, w_obj_tar_ind)
# and also into "AcroForm/CO" if a computation field.
# Into "AcroForm/CO" if a computation field.
if is_aac:
mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)

def do_links(
doc1: pymupdf.Document,
Expand Down
Binary file added tests/resources/merge-form1.pdf
Binary file not shown.
Binary file added tests/resources/merge-form2.pdf
Binary file not shown.
Loading