Skip to content

Commit 261415e

Browse files
committed
Handle duplicate widget names when joining PDFs
Previously, in some situations, duplicated fields on source pages (i.e. widgets sharing a common parent) were not correctly copied over to the target PDF. In addition, when source widgets happened to have the same names as widgets already present in the target, the merged PDF contained invalid widgets. This fix first copies all parent fields of the source widgets to the target in a separate step. Source-page widgets that are themselves root fields (i.e. have no parent) are then copied over individually. Finally, name duplicates are resolved by either renaming the source widgets or (optionally) joining them with the target widgets in common `/Kids` arrays.
1 parent fcb81eb commit 261415e

File tree

5 files changed

+341
-65
lines changed

5 files changed

+341
-65
lines changed

src/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -4562,6 +4562,7 @@ def insert_pdf(
45624562
links=1,
45634563
annots=1,
45644564
widgets=1,
4565+
join_duplicates=0,
45654566
show_progress=0,
45664567
final=1,
45674568
_gmap=None,
@@ -4577,6 +4578,7 @@ def insert_pdf(
45774578
links: (int/bool) whether to also copy links.
45784579
annots: (int/bool) whether to also copy annotations.
45794580
widgets: (int/bool) whether to also copy form fields.
4581+
join_duplicates: (int/bool) join or rename duplicate widget names.
45804582
show_progress: (int) progress message interval, 0 is no messages.
45814583
final: (bool) indicates last insertion from this source PDF.
45824584
_gmap: internal use only
@@ -4663,7 +4665,7 @@ def insert_pdf(
46634665
#log( 'insert_pdf(): calling self._do_links()')
46644666
self._do_links(docsrc, from_page=fp, to_page=tp, start_at=sa)
46654667
if widgets:
4666-
self._do_widgets(docsrc, _gmap, from_page=fp, to_page=tp, start_at=sa)
4668+
self._do_widgets(docsrc, _gmap, from_page=fp, to_page=tp, start_at=sa, join_duplicates=join_duplicates)
46674669
if final == 1:
46684670
self.Graftmaps[isrt] = None
46694671
#log( 'insert_pdf(): returning')

src/utils.py

+254-57
Original file line numberDiff line numberDiff line change
@@ -1683,15 +1683,185 @@ def do_widgets(
16831683
from_page: int = -1,
16841684
to_page: int = -1,
16851685
start_at: int = -1,
1686+
join_duplicates=0,
16861687
) -> None:
1687-
"""Insert widgets contained in copied page range into destination PDF.
1688+
"""Insert widgets of copied page range into target PDF.
16881689
1689-
Parameter values **must** equal those of method insert_pdf(). Method
1690-
insert_pdf() which must have been previously executed.
1690+
Parameter values **must** equal those of method insert_pdf() which
1691+
must have been previously executed.
16911692
"""
16921693
if not src.is_form_pdf: # nothing to do: source PDF has no fields
16931694
return
16941695

1696+
def clean_kid_parents(acro_fields):
    """Rewrite the "Parent" entry of every direct kid of every root field.

    Args:
        acro_fields: the "AcroForm/Fields" array object.

    Every entry of the array is visited in order, and each item of that
    entry's "Kids" value gets the entry itself stored under "Parent".
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    parent_key = pymupdf.PDF_NAME("Parent")

    for field_idx in range(acro_fields.pdf_array_len()):
        root_field = acro_fields.pdf_array_get(field_idx)
        children = root_field.pdf_dict_get(kids_key)
        for child_idx in range(children.pdf_array_len()):
            children.pdf_array_get(child_idx).pdf_dict_put(parent_key, root_field)
1704+
1705+
def join_widgets(pdf, acro_fields, xref1, xref2, name):
    """Put two equally named fields under one common parent.

    Called for each pair of widgets having the same name.

    Args:
        pdf: target MuPDF document.
        acro_fields: object Root/AcroForm/Fields.
        xref1, xref2: xrefs of the two fields sharing the name.
        name: (str) the shared field name.

    Result:
        If one of the two fields already has a "Kids" array, the other
        field (or, if it has kids itself, each of its kids) is appended to
        that array. Otherwise a new parent object carrying the name is
        created with both fields as its kids. "AcroForm/Fields" is updated
        to match in either case.
    """
    key_t = pymupdf.PDF_NAME("T")
    key_kids = pymupdf.PDF_NAME("Kids")
    key_parent = pymupdf.PDF_NAME("Parent")

    def drop_from_fields(ref):
        """Locate 'ref' in "AcroForm/Fields" and delete that entry."""
        acro_fields.pdf_array_delete(acro_fields.pdf_array_find(ref))

    def absorb(keeper_xref, keeper_kids, donor_xref, donor_kids):
        """Merge the donor field into the "Kids" array of the keeper field.

        Args:
            keeper_xref, keeper_kids: surviving field and its "Kids" array.
            donor_xref, donor_kids: field to merge and its "Kids" value
                (not an array if the donor has no kids).
        """
        keeper_ref = mupdf.pdf_new_indirect(pdf, keeper_xref, 0)
        donor_ref = mupdf.pdf_new_indirect(pdf, donor_xref, 0)

        # the donor no longer is a root field
        drop_from_fields(donor_ref)

        if donor_kids.pdf_is_array():
            # donor has kids: hand each of them over to the keeper
            for pos in range(donor_kids.pdf_array_len()):
                child = donor_kids.pdf_array_get(pos)
                child.pdf_dict_put(key_parent, keeper_ref)
                child_ref = mupdf.pdf_new_indirect(pdf, child.pdf_to_num(), 0)
                keeper_kids.pdf_array_push(child_ref)
            return

        # donor has no kids: it becomes a nameless kid of the keeper itself
        donor = mupdf.pdf_load_object(pdf, donor_xref)
        donor.pdf_dict_del(key_t)
        donor.pdf_dict_put(key_parent, keeper_ref)
        keeper_kids.pdf_array_push(donor_ref)

    def adopt(first, second):
        """Create a new named parent for two fields that both lack "Kids".

        Args:
            first, second: the loaded objects of xref1 and xref2.

        The new object receives the name, a "Kids" array holding both
        fields, and the "FT" and "AA" entries taken from the first field.
        Both fields lose their "T" entry and leave "AcroForm/Fields"; the
        new parent is appended there instead.
        """
        holder = mupdf.pdf_new_dict(pdf, 5)
        holder.pdf_dict_put_text_string(key_t, name)
        holder_kids = holder.pdf_dict_put_array(key_kids, 2)
        holder_obj = mupdf.pdf_add_object(pdf, holder)
        holder_ref = mupdf.pdf_new_indirect(pdf, holder_obj.pdf_to_num(), 0)

        # move some required properties from the first field to the parent
        for key in (pymupdf.PDF_NAME("FT"), pymupdf.PDF_NAME("AA")):
            value = first.pdf_dict_get(key)
            first.pdf_dict_del(key)
            holder_obj.pdf_dict_put(key, value)

        # the fields give up their name and point to the new parent
        for field in (first, second):
            field.pdf_dict_del(key_t)
            field.pdf_dict_put(key_parent, holder_ref)

        field_refs = [
            mupdf.pdf_new_indirect(pdf, xref1, 0),
            mupdf.pdf_new_indirect(pdf, xref2, 0),
        ]
        for ref in field_refs:  # become kids of the new parent ...
            holder_kids.pdf_array_push(ref)
        for ref in field_refs:  # ... and stop being root fields
            drop_from_fields(ref)

        acro_fields.pdf_array_push(holder_ref)

    w1 = mupdf.pdf_load_object(pdf, xref1)
    w2 = mupdf.pdf_load_object(pdf, xref2)
    kids1 = w1.pdf_dict_get(key_kids)
    kids2 = w2.pdf_dict_get(key_kids)

    # prefer a field that already owns a "Kids" array as the merge target
    if kids1.pdf_is_array():
        absorb(xref1, kids1, xref2, kids2)
    elif kids2.pdf_is_array():
        absorb(xref2, kids2, xref1, kids1)
    else:
        adopt(w1, w2)
1810+
1811+
def get_kids(parent, kids_list):
    """Return xref list of leaf kids for a parent.

    Call with an empty list.

    Args:
        parent: PDF object whose "Kids" entry is inspected.
        kids_list: list receiving the xrefs; it is extended in place.

    Returns:
        The same list object, extended by the xref of every kid that has
        no "Kids" array of its own. Kids that do have one are descended
        into recursively.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    kids = mupdf.pdf_dict_get(parent, kids_key)
    if not kids.pdf_is_array():
        return kids_list
    for i in range(kids.pdf_array_len()):
        kid = kids.pdf_array_get(i)
        # "Kids" is an array, so an intermediate node must be detected
        # with pdf_is_array. The previous pdf_is_dict test could never
        # succeed, so intermediate nodes were reported as leaves.
        if mupdf.pdf_is_array(mupdf.pdf_dict_get(kid, kids_key)):
            get_kids(kid, kids_list)
        else:
            kids_list.append(kid.pdf_to_num())
    return kids_list
1826+
1827+
def kids_xrefs(widget):
    """Look up a widget's immediate "Parent" and that parent's leaf kids.

    Args:
        widget: PDF object of the widget.

    Returns:
        Tuple (parent xref, list of kid xrefs as delivered by get_kids).
        If the parent xref is 0, the list is empty.
    """
    parent_obj = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
    xref = parent_obj.pdf_to_num()
    leaves = get_kids(parent_obj, []) if xref != 0 else []
    return xref, leaves
1836+
1837+
def deduplicate_names(pdf, acro_fields, join_duplicates=False):
    """Handle any widget name duplicates caused by the merge.

    Args:
        pdf: target MuPDF document.
        acro_fields: the "AcroForm/Fields" array object.
        join_duplicates: if true, combine equally named fields via
            join_widgets(); otherwise make the names unique by appending
            " [xref]" to the name of every field after the first one.

    Finally, clean_kid_parents() is run on "AcroForm/Fields".
    """
    t_key = pymupdf.PDF_NAME("T")
    names = {}  # key is a field name, value the list of xrefs having it

    # extract all names and xrefs in "AcroForm/Fields"
    for i in range(mupdf.pdf_array_len(acro_fields)):
        wobject = mupdf.pdf_array_get(acro_fields, i)
        field_name = mupdf.pdf_dict_get_text_string(wobject, t_key)
        names.setdefault(field_name, []).append(wobject.pdf_to_num())

    for name, xrefs in names.items():
        if len(xrefs) < 2:  # name is unique
            continue
        if join_duplicates:  # combine fields with equal names
            # Only the first pair is joined: join_widgets() does not
            # report which object ends up as the common parent, so
            # further duplicates cannot be chained onto it from here.
            join_widgets(pdf, acro_fields, xrefs[0], xrefs[1], name)
        else:  # make field names unique
            # Rename every field after the first one. Renaming only the
            # second would leave a third or later field clashing.
            for xref in xrefs[1:]:
                wobject = mupdf.pdf_load_object(pdf, xref)
                wobject.pdf_dict_put_text_string(t_key, f"{name} [{xref}]")

    clean_kid_parents(acro_fields)
1864+
16951865
def get_acroform(doc):
16961866
"""Retrieve the AcroForm dictionary form a PDF."""
16971867
pdf = mupdf.pdf_document_from_fz_document(doc)
@@ -1702,56 +1872,79 @@ def get_acroform(doc):
17021872
srcpdf = mupdf.pdf_document_from_fz_document(src)
17031873

17041874
if tar.is_form_pdf:
1705-
# target is a Form PDF, so use its AcroForm to include source fields
1875+
# target is a Form PDF, so use it to include source fields
17061876
acro = get_acroform(tar)
1707-
# Important arrays of indirect objects
1708-
tar_fields = mupdf.pdf_dict_get(acro, pymupdf.PDF_NAME("Fields"))
1709-
tar_co = mupdf.pdf_dict_get(acro, pymupdf.PDF_NAME("CO"))
1710-
if not mupdf.pdf_is_array(tar_co):
1711-
tar_co = mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("CO"), 5)
1877+
# Important arrays in AcroForm
1878+
acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
1879+
tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
1880+
if not tar_co.pdf_is_array():
1881+
tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
17121882
else:
17131883
# target is no Form PDF, so copy over source AcroForm
17141884
acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy
17151885

17161886
# Clear "Fields" and "CO" arrays: will be populated by page fields.
17171887
# This is required to avoid copying unneeded objects.
1718-
mupdf.pdf_dict_del(acro, pymupdf.PDF_NAME("Fields"))
1719-
mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("Fields"), 5)
1720-
mupdf.pdf_dict_del(acro, pymupdf.PDF_NAME("CO"))
1721-
mupdf.pdf_dict_put_array(acro, pymupdf.PDF_NAME("CO"), 5)
1888+
acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
1889+
acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
1890+
acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
1891+
acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
17221892

17231893
# Enrich AcroForm for copying to target
17241894
acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)
17251895

17261896
# Insert AcroForm into target PDF
17271897
acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
1728-
tar_fields = mupdf.pdf_dict_get(acro_tar, pymupdf.PDF_NAME("Fields"))
1729-
tar_co = mupdf.pdf_dict_get(acro_tar, pymupdf.PDF_NAME("CO"))
1898+
acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
1899+
tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))
17301900

17311901
# get its xref and insert it into target catalog
1732-
tar_xref = mupdf.pdf_to_num(acro_tar)
1902+
tar_xref = acro_tar.pdf_to_num()
17331903
acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
17341904
root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
1735-
mupdf.pdf_dict_put(root, pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)
1905+
root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)
17361906

17371907
if from_page <= to_page:
17381908
src_range = range(from_page, to_page + 1)
17391909
else:
17401910
src_range = range(from_page, to_page - 1, -1)
17411911

1742-
for i in range(len(src_range)):
1743-
# read first page that was copied over
1744-
tar_page = tar[start_at + i]
1745-
1746-
# convert it to a formal PDF page
1747-
tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)
1912+
parents = {} # information about widget parents
17481913

1749-
# extract its annotations array
1750-
tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
1751-
if not mupdf.pdf_is_array(tar_annots):
1752-
tar_annots = mupdf.pdf_dict_put_array(
1753-
tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
1754-
)
1914+
# remove "P" owning page reference from all widgets of all source pages
1915+
for i in src_range:
1916+
src_page = src[src_range[i]]
1917+
for xref in [
1918+
xref
1919+
for xref, wtype, _ in src_page.annot_xrefs()
1920+
if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member
1921+
]:
1922+
w_obj = mupdf.pdf_load_object(srcpdf, xref)
1923+
w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))
1924+
1925+
# get the widget's parent structure
1926+
parent_xref, old_kids = kids_xrefs(w_obj)
1927+
if parent_xref:
1928+
parents[parent_xref] = {
1929+
"new_xref": 0,
1930+
"old_kids": old_kids,
1931+
"new_kids": [],
1932+
}
1933+
# Copy over Parent widgets first - they are not page-dependent
1934+
for xref in parents.keys(): # pylint: disable=consider-using-dict-items
1935+
parent = mupdf.pdf_load_object(srcpdf, xref)
1936+
parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
1937+
parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
1938+
kids_xrefs_new = get_kids(parent_tar, [])
1939+
parent_xref_new = parent_tar.pdf_to_num()
1940+
parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
1941+
acro_fields.pdf_array_push(parent_ind)
1942+
parents[xref]["new_xref"] = parent_xref_new
1943+
parents[xref]["new_kids"] = kids_xrefs_new
1944+
1945+
for i in src_range:
1946+
# read first copied over page in target
1947+
tar_page = tar[start_at + i]
17551948

17561949
# read the original page in the source PDF
17571950
src_page = src[src_range[i]]
@@ -1762,44 +1955,48 @@ def get_acroform(doc):
17621955
for xref, wtype, _ in src_page.annot_xrefs()
17631956
if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member
17641957
]
1958+
if not w_xrefs: # no widgets on this source page
1959+
continue
17651960

1766-
# Remove page references from widgets to prevent duplicate copies
1767-
# of the page in the target.
1768-
for xref in w_xrefs:
1769-
w_obj = mupdf.pdf_load_object(srcpdf, xref)
1770-
mupdf.pdf_dict_del(w_obj, pymupdf.PDF_NAME("P"))
1961+
# convert to formal PDF page
1962+
tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)
1963+
1964+
# extract annotations array
1965+
tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
1966+
if not mupdf.pdf_is_array(tar_annots):
1967+
tar_annots = mupdf.pdf_dict_put_array(
1968+
tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
1969+
)
17711970

17721971
for xref in w_xrefs:
17731972
w_obj = mupdf.pdf_load_object(srcpdf, xref)
17741973

1775-
# check if field is a member of inter-field validations
1776-
temp = mupdf.pdf_dict_getp(w_obj, "AA/C")
1777-
if mupdf.pdf_is_dict(temp):
1778-
is_aac = True
1779-
else:
1780-
is_aac = False
1781-
1782-
# recursively complete the widget object with all referenced objects
1783-
w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
1784-
1785-
# add the completed widget object to the target PDF
1786-
w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
1787-
1788-
# extract its generated target xref number
1789-
tar_xref = mupdf.pdf_to_num(w_obj_tar)
1974+
# check if field takes part in inter-field validations
1975+
is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))
17901976

1791-
# create an indirect object from it
1792-
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
1793-
1794-
# insert this xref reference into the page,
1795-
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
1977+
# check if parent of widget already in target
1978+
parent_xref = mupdf.pdf_to_num(
1979+
w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
1980+
)
1981+
if parent_xref == 0: # parent not in target yet
1982+
w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
1983+
w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
1984+
tar_xref = w_obj_tar.pdf_to_num()
1985+
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
1986+
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
1987+
mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
1988+
else:
1989+
parent = parents[parent_xref]
1990+
idx = parent["old_kids"].index(xref) # search for xref in parent
1991+
tar_xref = parent["new_kids"][idx]
1992+
w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
1993+
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
17961994

1797-
# and also into "AcroForm/Fields",
1798-
mupdf.pdf_array_push(tar_fields, w_obj_tar_ind)
1799-
# and also into "AcroForm/CO" if a computation field.
1995+
# Into "AcroForm/CO" if a computation field.
18001996
if is_aac:
18011997
mupdf.pdf_array_push(tar_co, w_obj_tar_ind)
18021998

1999+
deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
18032000

18042001
def do_links(
18052002
doc1: pymupdf.Document,

tests/resources/merge-form1.pdf

6.11 KB
Binary file not shown.

tests/resources/merge-form2.pdf

6.18 KB
Binary file not shown.

0 commit comments

Comments
 (0)