Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pypdf/_font.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import dataclass, field
from typing import Any, Union, cast

from pypdf.generic import ArrayObject, DictionaryObject
from pypdf.generic import ArrayObject, DictionaryObject, NameObject

from ._cmap import get_encoding
from ._codecs.adobe_glyphs import adobe_glyphs
Expand Down Expand Up @@ -302,6 +302,18 @@ def from_font_resource(
interpretable=interpretable
)

def as_font_resource(self) -> DictionaryObject:
    """
    Build a minimal font resource dictionary for this font.

    The result always declares the ``/Type1`` subtype and ``/WinAnsiEncoding``,
    and uses ``self.name`` (prefixed with a slash) for both ``/Name`` and
    ``/BaseFont``.

    Returns:
        A new ``DictionaryObject`` describing the font.

    """
    # For now, the generated resource only works with the 14 Adobe Core fonts.
    slashed_name = f"/{self.name}"
    # Entry order is kept stable on purpose: it determines the order of the
    # keys in the resulting dictionary.
    entries = (
        ("/Subtype", "/Type1"),
        ("/Name", slashed_name),
        ("/Type", "/Font"),
        ("/BaseFont", slashed_name),
        ("/Encoding", "/WinAnsiEncoding"),
    )
    return DictionaryObject(
        {NameObject(key): NameObject(value) for key, value in entries}
    )

def text_width(self, text: str = "") -> float:
"""Sum of character widths specified in PDF font for the supplied text."""
return sum(
Expand Down
10 changes: 6 additions & 4 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,12 +902,14 @@ def _add_apstream_object(
ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"])
ap_stream_font_dict = cast(DictionaryObject, ap_stream_res.get("/Font", DictionaryObject()))
if "/Font" not in pg_res:
pg_res[NameObject("/Font")] = DictionaryObject()
pg_font_res = cast(DictionaryObject, pg_res["/Font"])
font_dict_ref = self._add_object(DictionaryObject())
pg_res[NameObject("/Font")] = font_dict_ref
pg_font_res = cast(DictionaryObject, pg_res["/Font"].get_object())
# Merge fonts from the appearance stream into the page's font resources
for font_name, font_ref in ap_stream_font_dict.items():
for font_name, font_res in ap_stream_font_dict.items():
if font_name not in pg_font_res:
pg_font_res[font_name] = font_ref
font_res_ref = self._add_object(font_res)
pg_font_res[font_name] = font_res_ref
# Always add the resolved stream object to the writer to get a new IndirectObject.
# This ensures we have a valid IndirectObject managed by *this* writer.
xobject_ref = self._add_object(appearance_stream_obj)
Expand Down
146 changes: 95 additions & 51 deletions pypdf/generic/_appearance_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,25 +343,43 @@ def __init__(

# If a font resource was added, get the font character map
if font_resource:
font_resource = cast(DictionaryObject, font_resource.get_object())
font = Font.from_font_resource(font_resource)
else:
logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
font_name = "/Helv"
font_resource = DictionaryObject({
NameObject("/Subtype"): NameObject("/Type1"),
NameObject("/Name"): NameObject("/Helv"),
NameObject("/Type"): NameObject("/Font"),
NameObject("/BaseFont"): NameObject("/Helvetica"),
NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
})
core_font_metrics = CORE_FONT_METRICS["Helvetica"]
font = Font(
name="Helvetica",
character_map={},
encoding=dict(zip(range(256), fill_from_encoding("cp1252"))), # WinAnsiEncoding
sub_type="Type1",
font_descriptor=CORE_FONT_METRICS["Helvetica"].font_descriptor,
character_widths=CORE_FONT_METRICS["Helvetica"].character_widths
font_descriptor=core_font_metrics.font_descriptor,
character_widths=core_font_metrics.character_widths
)
font_resource = font.as_font_resource()

# Check whether the font resource is able to encode the text value.
encodable = True
try:
if isinstance(font.encoding, str):
text.encode(font.encoding, "surrogatepass")
else:
supported_chars = set(font.encoding.values())
if any(char not in supported_chars for char in text):
encodable = False
# We should add a final check against the character_map (CMap) of the font,
# but we don't appear to have PDF forms with such fonts, so we skip this for
# now.

except UnicodeEncodeError:
encodable = False

if not encodable:
logger_warning(
f"Text string '{text}' contains characters not supported by font encoding. "
"This may result in text corruption. "
"Consider calling writer.update_page_form_field_values with auto_regenerate=True.",
__name__
)

font_glyph_byte_map: dict[str, bytes]
Expand Down Expand Up @@ -398,6 +416,44 @@ def __init__(
})
})

@staticmethod
def _find_annotation_font_resource(
    font_name: str,
    annotation: DictionaryObject,
    acro_form: DictionaryObject
) -> tuple[str, DictionaryObject]:
    """
    Locate, or synthesize, the font resource to use for an annotation.

    The ``/DR`` resources are looked up on the annotation (including inherited
    values), falling back to the ``/DR`` entry of the AcroForm. If no entry for
    ``font_name`` exists among the ``/Font`` resources, a font resource is
    constructed from ``CORE_FONT_METRICS``; names that are not present there
    are replaced by Helvetica, and a warning is logged.

    Args:
        font_name: Font name to look up, e.g. ``/Helv``.
        annotation: The annotation whose resources are examined first.
        acro_form: The AcroForm dictionary providing the fallback ``/DR``.

    Returns:
        A tuple of the (possibly replaced) font name and the font resource.

    """
    # Prefer the annotation's own (or inherited) resources; the AcroForm
    # resources only serve as the default.
    fallback_resources = acro_form.get("/DR", DictionaryObject())
    resources: Any = cast(
        DictionaryObject,
        annotation.get_inherited("/DR", fallback_resources),
    )
    known_fonts = resources.get("/Font", DictionaryObject())
    font_resource = known_fonts.get(font_name, None)

    if not is_null_or_none(font_resource):
        return font_name, font_resource

    # No font resource was found. This can happen when a user has provided a
    # specific font name, or when one of the 14 Adobe Core fonts is expected
    # to be constructed by us.
    core_name = font_name.removeprefix("/")
    if core_name not in CORE_FONT_METRICS:
        # Neither found nor constructible: default to Helvetica.
        logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__)
        font_name = "/Helvetica"
        core_name = font_name.removeprefix("/")

    metrics = CORE_FONT_METRICS[core_name]
    win_ansi_encoding = dict(zip(range(256), fill_from_encoding("cp1252")))  # WinAnsiEncoding
    core_font = Font(
        name=core_name,
        character_map={},
        encoding=win_ansi_encoding,
        sub_type="Type1",
        font_descriptor=metrics.font_descriptor,
        character_widths=metrics.character_widths
    )
    return font_name, core_font.as_font_resource()

@classmethod
def from_text_annotation(
cls,
Expand Down Expand Up @@ -443,6 +499,23 @@ def from_text_annotation(
else:
default_appearance = default_appearance.get_object()

# Retrieve field text and selected values
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
if (
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
):
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
selection = field.get("/V", [])
if not isinstance(selection, list):
selection = [selection]
else: # /Tx
text = field.get("/V", "")
selection = []

# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")

# Derive font name, size and color from the default appearance. Also set
# user-provided font name and font size in the default appearance, if given.
# For a font name, this presumes that we can find an associated font resource
Expand All @@ -463,46 +536,7 @@ def from_text_annotation(
if user_font_size > 0:
font_size = user_font_size

# Try to find a resource dictionary for the font
document_resources: Any = cast(
DictionaryObject,
cast(
DictionaryObject,
annotation.get_inherited(
"/DR",
acro_form.get("/DR", DictionaryObject()),
),
).get_object(),
)
document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object()
# CORE_FONT_METRICS is the dict with Standard font metrics
if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS:
# ...or AcroForm dictionary
document_resources = cast(
dict[Any, Any],
acro_form.get("/DR", {}),
)
document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object()
font_resource = document_font_resources.get(font_name, None)
if not is_null_or_none(font_resource):
font_resource = cast(DictionaryObject, font_resource.get_object())

# Retrieve field text and selected values
field_flags = field.get(FieldDictionaryAttributes.Ff, 0)
if (
field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and
field_flags & FieldDictionaryAttributes.FfBits.Combo == 0
):
text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, []))
selection = field.get("/V", [])
if not isinstance(selection, list):
selection = [selection]
else: # /Tx
text = field.get("/V", "")
selection = []

# Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings)
text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)")
font_name, font_resource = cls._find_annotation_font_resource(font_name, annotation, acro_form)

# Retrieve formatting information
is_comb = False
Expand Down Expand Up @@ -535,11 +569,21 @@ def from_text_annotation(
is_comb=is_comb,
max_length=max_length
)

if AnnotationDictionaryAttributes.AP in annotation:
for key, value in (
cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items()
):
if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
if key in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}:
continue
# Don't overwrite font resources added by TextAppearanceStream.__init__
if key == "/Resources":
if "/Font" not in value:
value.get_object()[NameObject("/Font")] = DictionaryObject()
value["/Font"].get_object()[NameObject(font_name)] = getattr(
font_resource, "indirect_reference", font_resource
)
else:
new_appearance_stream[key] = value

return new_appearance_stream
20 changes: 11 additions & 9 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1506,7 +1506,7 @@ def test_named_dest_page_number():
assert len(writer.root_object["/Names"]["/Dests"]["/Names"]) == 6


def test_update_form_fields(tmp_path):
def test_update_form_fields(caplog, tmp_path):
write_data_here = tmp_path / "out.pdf"
writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf")
writer.update_page_form_field_values(
Expand Down Expand Up @@ -1572,10 +1572,11 @@ def test_update_form_fields(tmp_path):
del writer.root_object["/AcroForm"]["/Fields"][1].get_object()["/DR"]["/Font"]
writer.update_page_form_field_values(
[writer.pages[0], writer.pages[1]],
{"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
{"Text1": "!مرحبا بالعالم", "Text2": "ligne1\nligne2\nligne3"},
auto_regenerate=False,
)
assert b"/Helv " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
assert b"/Helvetica " in writer.pages[1]["/Annots"][1]["/AP"]["/N"].get_data()
assert "Text string '!مرحبا بالعالم' contains characters not supported by font encoding." in caplog.text
writer.update_page_form_field_values(
None,
{"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
Expand Down Expand Up @@ -1646,7 +1647,7 @@ def test_merge_content_stream_to_page():


@pytest.mark.enable_socket
def test_update_form_fields2():
def test_update_form_fields2(caplog):
my_files = {
"test1": {
"name": "Test1 Form",
Expand Down Expand Up @@ -1679,7 +1680,7 @@ def test_update_form_fields2():
"Initial": "JSS",
# "p2 I DO NOT Agree": "null",
"p2 Last Name": "Smith",
"p3 First Name": "John",
"p3 First Name": "شهرزاد",
"p3 Middle Name": "R",
"p3 MM": "01",
"p3 DD": "25",
Expand Down Expand Up @@ -1718,12 +1719,13 @@ def test_update_form_fields2():
"test2.Initial": "JSS",
"test2.p2 I DO NOT Agree": None,
"test2.p2 Last Name": "Smith",
"test2.p3 First Name": "John",
"test2.p3 First Name": "شهرزاد",
"test2.p3 Middle Name": "R",
"test2.p3 MM": "01",
"test2.p3 DD": "25",
"test2.p3 YY": "21",
}
assert "Text string 'شهرزاد' contains characters not supported by font encoding." in caplog.text


@pytest.mark.enable_socket
Expand Down Expand Up @@ -2411,7 +2413,7 @@ def test_selfont():


@pytest.mark.enable_socket
def test_no_resource_for_14_std_fonts(caplog):
def test_no_resource_for_14_std_fonts():
"""Cf #2670"""
url = "https://github.com/py-pdf/pypdf/files/15405390/f1040.pdf"
name = "iss2670.pdf"
Expand All @@ -2423,7 +2425,7 @@ def test_no_resource_for_14_std_fonts(caplog):
writer.update_page_form_field_values(
p, {a["/T"]: "Brooks"}, auto_regenerate=False
)
assert "Font dictionary for /Helvetica not found; defaulting to Helvetica." in caplog.text
assert "/Helvetica" in a["/AP"]["/N"]["/Resources"]["/Font"]


@pytest.mark.enable_socket
Expand All @@ -2435,7 +2437,7 @@ def test_field_box_upside_down():
writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"})
assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == (
b"q\n/Tx BMC \nq\n2 1 102.29520000000001 9.835000000000036 re\n"
b"W\nBT\n/Helv 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
b"W\nBT\n/Arial 8.0 Tf 0 g\n2 3.0455000000000183 Td\n(0) Tj\nET\n"
b"Q\nEMC\nQ\n"
)
box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"]
Expand Down