@@ -1683,15 +1683,185 @@ def do_widgets(
1683
1683
from_page : int = - 1 ,
1684
1684
to_page : int = - 1 ,
1685
1685
start_at : int = - 1 ,
1686
+ join_duplicates = 0 ,
1686
1687
) -> None :
1687
- """Insert widgets contained in copied page range into destination PDF.
1688
+ """Insert widgets of copied page range into target PDF.
1688
1689
1689
- Parameter values **must** equal those of method insert_pdf(). Method
1690
- insert_pdf() which must have been previously executed.
1690
+ Parameter values **must** equal those of method insert_pdf() which
1691
+ must have been previously executed.
1691
1692
"""
1692
1693
if not src .is_form_pdf : # nothing to do: source PDF has no fields
1693
1694
return
1694
1695
1696
def clean_kid_parents(acro_fields):
    """Point every first-level kid back at the field that lists it.

    Visits each entry of the given "Fields" array and stores that entry as
    the "Parent" of every item found in the entry's "Kids" array.

    Args:
        acro_fields: the array object Root/AcroForm/Fields.
    """
    parent_key = pymupdf.PDF_NAME("Parent")
    kids_key = pymupdf.PDF_NAME("Kids")
    for field_no in range(mupdf.pdf_array_len(acro_fields)):
        field = mupdf.pdf_array_get(acro_fields, field_no)
        children = mupdf.pdf_dict_get(field, kids_key)
        # NOTE(review): relies on the array length of a missing "Kids"
        # entry being 0, so that kid-less fields are skipped - confirm.
        for child_no in range(mupdf.pdf_array_len(children)):
            child = mupdf.pdf_array_get(children, child_no)
            mupdf.pdf_dict_put(child, parent_key, field)
1704
+
1705
def join_widgets(pdf, acro_fields, xref1, xref2, name):
    """Put two widgets that share a field name under one common parent.

    Args:
        pdf: target MuPDF document
        acro_fields: the array object Root/AcroForm/Fields
        xref1, xref2: xrefs of the two widgets having the same name
        name: (str) the shared field name

    Result:
        If one of the widgets already owns a "Kids" array, the other widget
        (or the other widget's kids) is appended to that array. If neither
        has kids, a new parent object carrying the name is created, both
        widgets become its kids, and it replaces them in "Fields".
    """
    key_t = pymupdf.PDF_NAME("T")
    key_parent = pymupdf.PDF_NAME("Parent")
    key_kids = pymupdf.PDF_NAME("Kids")

    def merge_into(target_xref, target_kids, source_xref, source_kids):
        """Append the source widget (or its kids) to the target's "Kids".

        Args:
            target_xref, target_kids: receiving widget and its "Kids" array.
            source_xref, source_kids: widget to merge and its "Kids" entry
                (which is not an array if the widget has no kids).
        """
        # indirect references to both widgets
        target_ref = mupdf.pdf_new_indirect(pdf, target_xref, 0)
        source_ref = mupdf.pdf_new_indirect(pdf, source_xref, 0)

        # the source widget no longer is a top-level field
        position = acro_fields.pdf_array_find(source_ref)
        acro_fields.pdf_array_delete(position)

        if mupdf.pdf_is_array(source_kids):
            # source has kids: hand each of them over to the target
            for kid_no in range(mupdf.pdf_array_len(source_kids)):
                child = mupdf.pdf_array_get(source_kids, kid_no)
                mupdf.pdf_dict_put(child, key_parent, target_ref)
                child_ref = mupdf.pdf_new_indirect(pdf, mupdf.pdf_to_num(child), 0)
                mupdf.pdf_array_push(target_kids, child_ref)
            return

        # source has no kids: it becomes a kid of the target itself, so it
        # gives up its own name and points to the target as its parent
        source = mupdf.pdf_load_object(pdf, source_xref)
        mupdf.pdf_dict_del(source, key_t)
        mupdf.pdf_dict_put(source, key_parent, target_ref)
        mupdf.pdf_array_push(target_kids, source_ref)

    def make_parent(first, second):
        """Create a new named parent object owning both kid-less widgets.

        Args:
            first: widget object of xref1
            second: widget object of xref2
        """
        # the new parent: carries the name and an (initially empty) "Kids"
        parent_dict = mupdf.pdf_new_dict(pdf, 5)
        parent_dict.pdf_dict_put_text_string(key_t, name)
        parent_kids = mupdf.pdf_dict_put_array(parent_dict, key_kids, 2)
        parent_obj = mupdf.pdf_add_object(pdf, parent_dict)
        parent_ref = mupdf.pdf_new_indirect(pdf, mupdf.pdf_to_num(parent_obj), 0)

        # move field type and additional actions from first widget to parent
        for key in (pymupdf.PDF_NAME("FT"), pymupdf.PDF_NAME("AA")):
            value = mupdf.pdf_dict_get(first, key)
            mupdf.pdf_dict_del(first, key)
            mupdf.pdf_dict_put(parent_obj, key, value)

        # both widgets lose their name and refer to the new parent instead
        for widget in (first, second):
            mupdf.pdf_dict_del(widget, key_t)
            mupdf.pdf_dict_put(widget, key_parent, parent_ref)

        # register the widgets as kids of the new parent
        widget_refs = (
            mupdf.pdf_new_indirect(pdf, xref1, 0),
            mupdf.pdf_new_indirect(pdf, xref2, 0),
        )
        for ref in widget_refs:
            mupdf.pdf_array_push(parent_kids, ref)

        # in "AcroForm/Fields", the parent takes the place of the widgets
        for ref in widget_refs:
            position = acro_fields.pdf_array_find(ref)
            acro_fields.pdf_array_delete(position)
        mupdf.pdf_array_push(acro_fields, parent_ref)

    widget1 = mupdf.pdf_load_object(pdf, xref1)
    widget2 = mupdf.pdf_load_object(pdf, xref2)
    kids1 = mupdf.pdf_dict_get(widget1, key_kids)
    kids2 = mupdf.pdf_dict_get(widget2, key_kids)

    # prefer an existing "Kids" array as merge target, first widget first
    if mupdf.pdf_is_array(kids1):
        merge_into(xref1, kids1, xref2, kids2)
    elif mupdf.pdf_is_array(kids2):
        merge_into(xref2, kids2, xref1, kids1)
    else:
        make_parent(widget1, widget2)
1810
+
1811
def get_kids(parent, kids_list):
    """Return the xref list of leaf kids below a parent field.

    Descends recursively through "Kids" arrays: a kid that itself has a
    "Kids" array is treated as an intermediate node and replaced by its own
    leaves; any other kid contributes its xref.

    Args:
        parent: PDF object of the field whose kids are wanted.
        kids_list: list to which xrefs are appended. Call with an empty
            list; it is extended in place.

    Returns:
        ``kids_list`` with the leaf xrefs appended. It is returned
        unchanged if ``parent`` has no "Kids" array.
    """
    kids_key = pymupdf.PDF_NAME("Kids")
    kids = mupdf.pdf_dict_get(parent, kids_key)
    if not mupdf.pdf_is_array(kids):
        return kids_list
    for i in range(mupdf.pdf_array_len(kids)):
        kid = mupdf.pdf_array_get(kids, i)
        # "Kids" is an array, so it must be tested with pdf_is_array().
        # Testing for a dictionary never matched, which left the recursive
        # branch unreachable and reported intermediate nodes as leaves.
        if mupdf.pdf_is_array(mupdf.pdf_dict_get(kid, kids_key)):
            kids_list = get_kids(kid, kids_list)
        else:
            kids_list.append(kid.pdf_to_num())
    return kids_list
1826
+
1827
def kids_xrefs(widget):
    """Look up a widget's "Parent" and the kids reachable from it.

    Args:
        widget: PDF object of the widget.

    Returns:
        A tuple ``(parent_xref, kids)``. ``parent_xref`` is the xref of the
        object stored under the widget's "Parent" key and ``kids`` is the
        xref list delivered by ``get_kids()`` for that object. If the xref
        is 0, ``kids`` is an empty list.
    """
    parent_obj = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
    xref = parent_obj.pdf_to_num()
    # xref 0 is treated as "no parent": nothing to collect in that case
    leaves = get_kids(parent_obj, []) if xref != 0 else []
    return xref, leaves
1836
+
1837
def deduplicate_names(pdf, acro_fields, join_duplicates=False):
    """Handle any widget name duplicates caused by the merge.

    Args:
        pdf: target MuPDF document
        acro_fields: the array object Root/AcroForm/Fields
        join_duplicates: if true, combine the first two fields of each
            group of equally named fields via ``join_widgets()``. Otherwise
            make names unique by appending " [xref]" to the name of every
            field of the group except the first one.

    Result:
        Duplicate names among the entries of "Fields" are joined or
        renamed, and finally ``clean_kid_parents()`` is run on "Fields".
    """
    name_key = pymupdf.PDF_NAME("T")
    names = {}  # key is a widget name, value a list of xrefs having it.

    # extract all names and widgets in "AcroForm/Fields"
    for i in range(mupdf.pdf_array_len(acro_fields)):
        wobject = mupdf.pdf_array_get(acro_fields, i)
        T = mupdf.pdf_dict_get_text_string(wobject, name_key)
        names.setdefault(T, []).append(wobject.pdf_to_num())

    for name, xrefs in names.items():
        if len(xrefs) < 2:  # name is unique
            continue
        if join_duplicates:  # combine fields with equal names
            # Only the first pair is joined: join_widgets() may replace the
            # first widget in "Fields" by a new parent object and does not
            # report which object ends up owning the name.
            join_widgets(pdf, acro_fields, xrefs[0], xrefs[1], name)
        else:  # make field names unique
            # Rename every surplus widget, not just the second one, so no
            # duplicate survives when more than two fields share a name.
            for xref in xrefs[1:]:
                wobject = mupdf.pdf_load_object(pdf, xref)
                wobject.pdf_dict_put_text_string(name_key, f"{name} [{xref}]")

    clean_kid_parents(acro_fields)
1864
+
1695
1865
def get_acroform (doc ):
1696
1866
"""Retrieve the AcroForm dictionary form a PDF."""
1697
1867
pdf = mupdf .pdf_document_from_fz_document (doc )
@@ -1702,56 +1872,79 @@ def get_acroform(doc):
1702
1872
srcpdf = mupdf .pdf_document_from_fz_document (src )
1703
1873
1704
1874
if tar .is_form_pdf :
1705
- # target is a Form PDF, so use its AcroForm to include source fields
1875
+ # target is a Form PDF, so use it to include source fields
1706
1876
acro = get_acroform (tar )
1707
- # Important arrays of indirect objects
1708
- tar_fields = mupdf .pdf_dict_get (acro , pymupdf .PDF_NAME ("Fields" ))
1709
- tar_co = mupdf .pdf_dict_get (acro , pymupdf .PDF_NAME ("CO" ))
1710
- if not mupdf .pdf_is_array (tar_co ):
1711
- tar_co = mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("CO" ), 5 )
1877
+ # Important arrays in AcroForm
1878
+ acro_fields = acro .pdf_dict_get (pymupdf .PDF_NAME ("Fields" ))
1879
+ tar_co = acro .pdf_dict_get (pymupdf .PDF_NAME ("CO" ))
1880
+ if not tar_co .pdf_is_array ():
1881
+ tar_co = acro .pdf_dict_put_array (pymupdf .PDF_NAME ("CO" ), 5 )
1712
1882
else :
1713
1883
# target is no Form PDF, so copy over source AcroForm
1714
1884
acro = mupdf .pdf_deep_copy_obj (get_acroform (src )) # make a copy
1715
1885
1716
1886
# Clear "Fields" and "CO" arrays: will be populated by page fields.
1717
1887
# This is required to avoid copying unneeded objects.
1718
- mupdf .pdf_dict_del (acro , pymupdf .PDF_NAME ("Fields" ))
1719
- mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("Fields" ), 5 )
1720
- mupdf .pdf_dict_del (acro , pymupdf .PDF_NAME ("CO" ))
1721
- mupdf .pdf_dict_put_array (acro , pymupdf .PDF_NAME ("CO" ), 5 )
1888
+ acro .pdf_dict_del (pymupdf .PDF_NAME ("Fields" ))
1889
+ acro .pdf_dict_put_array (pymupdf .PDF_NAME ("Fields" ), 5 )
1890
+ acro .pdf_dict_del (pymupdf .PDF_NAME ("CO" ))
1891
+ acro .pdf_dict_put_array (pymupdf .PDF_NAME ("CO" ), 5 )
1722
1892
1723
1893
# Enrich AcroForm for copying to target
1724
1894
acro_graft = mupdf .pdf_graft_mapped_object (graftmap , acro )
1725
1895
1726
1896
# Insert AcroForm into target PDF
1727
1897
acro_tar = mupdf .pdf_add_object (tarpdf , acro_graft )
1728
- tar_fields = mupdf .pdf_dict_get (acro_tar , pymupdf .PDF_NAME ("Fields" ))
1729
- tar_co = mupdf .pdf_dict_get (acro_tar , pymupdf .PDF_NAME ("CO" ))
1898
+ acro_fields = acro_tar .pdf_dict_get (pymupdf .PDF_NAME ("Fields" ))
1899
+ tar_co = acro_tar .pdf_dict_get (pymupdf .PDF_NAME ("CO" ))
1730
1900
1731
1901
# get its xref and insert it into target catalog
1732
- tar_xref = mupdf .pdf_to_num (acro_tar )
1902
+ tar_xref = acro_tar .pdf_to_num ()
1733
1903
acro_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1734
1904
root = mupdf .pdf_dict_get (mupdf .pdf_trailer (tarpdf ), pymupdf .PDF_NAME ("Root" ))
1735
- mupdf .pdf_dict_put (root , pymupdf .PDF_NAME ("AcroForm" ), acro_tar_ind )
1905
+ root .pdf_dict_put (pymupdf .PDF_NAME ("AcroForm" ), acro_tar_ind )
1736
1906
1737
1907
if from_page <= to_page :
1738
1908
src_range = range (from_page , to_page + 1 )
1739
1909
else :
1740
1910
src_range = range (from_page , to_page - 1 , - 1 )
1741
1911
1742
- for i in range (len (src_range )):
1743
- # read first page that was copied over
1744
- tar_page = tar [start_at + i ]
1745
-
1746
- # convert it to a formal PDF page
1747
- tar_page_pdf = mupdf .pdf_page_from_fz_page (tar_page )
1912
+ parents = {} # information about widget parents
1748
1913
1749
- # extract its annotations array
1750
- tar_annots = mupdf .pdf_dict_get (tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ))
1751
- if not mupdf .pdf_is_array (tar_annots ):
1752
- tar_annots = mupdf .pdf_dict_put_array (
1753
- tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ), 5
1754
- )
1914
+ # remove "P" owning page reference from all widgets of all source pages
1915
+ for i in src_range :
1916
+ src_page = src [src_range [i ]]
1917
+ for xref in [
1918
+ xref
1919
+ for xref , wtype , _ in src_page .annot_xrefs ()
1920
+ if wtype == pymupdf .PDF_ANNOT_WIDGET # pylint: disable=no-member
1921
+ ]:
1922
+ w_obj = mupdf .pdf_load_object (srcpdf , xref )
1923
+ w_obj .pdf_dict_del (pymupdf .PDF_NAME ("P" ))
1924
+
1925
+ # get the widget's parent structure
1926
+ parent_xref , old_kids = kids_xrefs (w_obj )
1927
+ if parent_xref :
1928
+ parents [parent_xref ] = {
1929
+ "new_xref" : 0 ,
1930
+ "old_kids" : old_kids ,
1931
+ "new_kids" : [],
1932
+ }
1933
+ # Copy over Parent widgets first - they are not page-dependent
1934
+ for xref in parents .keys (): # pylint: disable=consider-using-dict-items
1935
+ parent = mupdf .pdf_load_object (srcpdf , xref )
1936
+ parent_graft = mupdf .pdf_graft_mapped_object (graftmap , parent )
1937
+ parent_tar = mupdf .pdf_add_object (tarpdf , parent_graft )
1938
+ kids_xrefs_new = get_kids (parent_tar , [])
1939
+ parent_xref_new = parent_tar .pdf_to_num ()
1940
+ parent_ind = mupdf .pdf_new_indirect (tarpdf , parent_xref_new , 0 )
1941
+ acro_fields .pdf_array_push (parent_ind )
1942
+ parents [xref ]["new_xref" ] = parent_xref_new
1943
+ parents [xref ]["new_kids" ] = kids_xrefs_new
1944
+
1945
+ for i in src_range :
1946
+ # read first copied over page in target
1947
+ tar_page = tar [start_at + i ]
1755
1948
1756
1949
# read the original page in the source PDF
1757
1950
src_page = src [src_range [i ]]
@@ -1762,44 +1955,48 @@ def get_acroform(doc):
1762
1955
for xref , wtype , _ in src_page .annot_xrefs ()
1763
1956
if wtype == pymupdf .PDF_ANNOT_WIDGET # pylint: disable=no-member
1764
1957
]
1958
+ if not w_xrefs : # no widgets on this source page
1959
+ continue
1765
1960
1766
- # Remove page references from widgets to prevent duplicate copies
1767
- # of the page in the target.
1768
- for xref in w_xrefs :
1769
- w_obj = mupdf .pdf_load_object (srcpdf , xref )
1770
- mupdf .pdf_dict_del (w_obj , pymupdf .PDF_NAME ("P" ))
1961
+ # convert to formal PDF page
1962
+ tar_page_pdf = mupdf .pdf_page_from_fz_page (tar_page )
1963
+
1964
+ # extract annotations array
1965
+ tar_annots = mupdf .pdf_dict_get (tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ))
1966
+ if not mupdf .pdf_is_array (tar_annots ):
1967
+ tar_annots = mupdf .pdf_dict_put_array (
1968
+ tar_page_pdf .obj (), pymupdf .PDF_NAME ("Annots" ), 5
1969
+ )
1771
1970
1772
1971
for xref in w_xrefs :
1773
1972
w_obj = mupdf .pdf_load_object (srcpdf , xref )
1774
1973
1775
- # check if field is a member of inter-field validations
1776
- temp = mupdf .pdf_dict_getp (w_obj , "AA/C" )
1777
- if mupdf .pdf_is_dict (temp ):
1778
- is_aac = True
1779
- else :
1780
- is_aac = False
1781
-
1782
- # recursively complete the widget object with all referenced objects
1783
- w_obj_graft = mupdf .pdf_graft_mapped_object (graftmap , w_obj )
1784
-
1785
- # add the completed widget object to the target PDF
1786
- w_obj_tar = mupdf .pdf_add_object (tarpdf , w_obj_graft )
1787
-
1788
- # extract its generated target xref number
1789
- tar_xref = mupdf .pdf_to_num (w_obj_tar )
1974
+ # check if field takes part in inter-field validations
1975
+ is_aac = mupdf .pdf_is_dict (mupdf .pdf_dict_getp (w_obj , "AA/C" ))
1790
1976
1791
- # create an indirect object from it
1792
- w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1793
-
1794
- # insert this xref reference into the page,
1795
- mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
1977
+ # check if parent of widget already in target
1978
+ parent_xref = mupdf .pdf_to_num (
1979
+ w_obj .pdf_dict_get (pymupdf .PDF_NAME ("Parent" ))
1980
+ )
1981
+ if parent_xref == 0 : # parent not in target yet
1982
+ w_obj_graft = mupdf .pdf_graft_mapped_object (graftmap , w_obj )
1983
+ w_obj_tar = mupdf .pdf_add_object (tarpdf , w_obj_graft )
1984
+ tar_xref = w_obj_tar .pdf_to_num ()
1985
+ w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1986
+ mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
1987
+ mupdf .pdf_array_push (acro_fields , w_obj_tar_ind )
1988
+ else :
1989
+ parent = parents [parent_xref ]
1990
+ idx = parent ["old_kids" ].index (xref ) # search for xref in parent
1991
+ tar_xref = parent ["new_kids" ][idx ]
1992
+ w_obj_tar_ind = mupdf .pdf_new_indirect (tarpdf , tar_xref , 0 )
1993
+ mupdf .pdf_array_push (tar_annots , w_obj_tar_ind )
1796
1994
1797
- # and also into "AcroForm/Fields",
1798
- mupdf .pdf_array_push (tar_fields , w_obj_tar_ind )
1799
- # and also into "AcroForm/CO" if a computation field.
1995
+ # Into "AcroForm/CO" if a computation field.
1800
1996
if is_aac :
1801
1997
mupdf .pdf_array_push (tar_co , w_obj_tar_ind )
1802
1998
1999
+ deduplicate_names (tarpdf , acro_fields , join_duplicates = join_duplicates )
1803
2000
1804
2001
def do_links (
1805
2002
doc1 : pymupdf .Document ,
0 commit comments