Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 184 additions & 14 deletions pypdf/_font.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
from collections.abc import Sequence
from dataclasses import dataclass, field
from io import BytesIO
from typing import Any, Union, cast

from pypdf.generic import ArrayObject, DictionaryObject, NameObject
from pypdf.generic import (
ArrayObject,
DictionaryObject,
FloatObject,
NameObject,
NumberObject,
StreamObject,
TextStringObject,
)

from ._cmap import get_encoding
from ._codecs.adobe_glyphs import adobe_glyphs
from ._utils import logger_warning
from .constants import FontFlags
from .errors import PdfReadError

try:
from fontTools.ttLib import TTFont
HAS_FONTTOOLS = True
except ImportError:
HAS_FONTTOOLS = False


@dataclass(frozen=True)
Expand All @@ -31,6 +47,26 @@ class FontDescriptor:
italic_angle: float = 0.0 # Non-italic
flags: int = 32 # Non-serif, non-symbolic, not fixed width
bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
font_file: Union[StreamObject, None] = None

def as_font_descriptor_resource(self) -> DictionaryObject:
    """
    Build the PDF ``/FontDescriptor`` dictionary for this descriptor.

    Returns:
        A dictionary holding /Type, /FontName, /Flags, /FontBBox,
        /ItalicAngle, /Ascent, /Descent, /CapHeight and /XHeight. If a font
        program stream is attached, it is stored under /FontFile,
        /FontFile2 or /FontFile3, depending on the entries of the stream.

    """
    font_descriptor_resource = DictionaryObject({
        NameObject("/Type"): NameObject("/FontDescriptor"),
        NameObject("/FontName"): NameObject(f"/{self.name}"),
        NameObject("/Flags"): NumberObject(self.flags),
        NameObject("/FontBBox"): ArrayObject([FloatObject(n) for n in self.bbox]),
        NameObject("/ItalicAngle"): FloatObject(self.italic_angle),
        NameObject("/Ascent"): FloatObject(self.ascent),
        NameObject("/Descent"): FloatObject(self.descent),
        NameObject("/CapHeight"): FloatObject(self.cap_height),
        NameObject("/XHeight"): FloatObject(self.x_height),
    })

    # Compare against None explicitly: the stream object is dictionary-like
    # (it is updated and indexed like a dict elsewhere in this module), so a
    # truthiness test would silently drop a stream without dictionary entries.
    if self.font_file is not None:
        # Pick the key from the stream's own entries instead of always
        # assuming TrueType: per the PDF specification, only /FontFile3
        # streams carry /Subtype, and only Type 1 programs (/FontFile)
        # carry /Length2. Everything else is treated as TrueType.
        if "/Subtype" in self.font_file:
            font_file_key = "/FontFile3"
        elif "/Length2" in self.font_file:
            font_file_key = "/FontFile"
        else:
            font_file_key = "/FontFile2"
        font_descriptor_resource[NameObject(font_file_key)] = self.font_file

    return font_descriptor_resource


@dataclass(frozen=True)
Expand Down Expand Up @@ -192,6 +228,17 @@ def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
valid_widths = [w for w in current_widths.values() if w > 0]
current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500

@staticmethod
def _add_space_width(character_widths: dict[str, int], flags: int) -> int:
    """
    Determine the width to use for the space character.

    Args:
        character_widths: Mapping of characters to widths. It must contain
            a "default" entry whenever it has no non-zero width for " ".
        flags: Font descriptor flags, checked for the fixed-pitch bit.

    Returns:
        The explicit width of " " if it is present and non-zero. Otherwise
        the default width for fixed-pitch fonts, or half of the default
        width (integer division) for all other fonts.

    """
    explicit_width = character_widths.get(" ", 0)
    if explicit_width != 0:
        return explicit_width

    fallback_width = character_widths["default"]
    is_fixed_pitch = (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH
    return fallback_width if is_fixed_pitch else fallback_width // 2

@staticmethod
def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
font_descriptor_kwargs: dict[Any, Any] = {}
Expand All @@ -214,6 +261,18 @@ def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, A
bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
assert len(bbox_tuple) == 4, bbox_tuple
font_descriptor_kwargs["bbox"] = bbox_tuple

# Find the binary stream for this font if there is one
for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
if source_key in font_descriptor_obj:
if "font_file" in font_descriptor_kwargs:
raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

try:
font_file = font_descriptor_obj[source_key].get_object()
font_descriptor_kwargs["font_file"] = font_file
except PdfReadError as e:
logger_warning(f"Failed to get '{source_key}' in {font_descriptor_obj}: {e}", __name__)
return font_descriptor_kwargs

@classmethod
Expand Down Expand Up @@ -284,12 +343,8 @@ def from_font_resource(

if character_widths.get("default", 0) == 0:
cls._add_default_width(character_widths, font_descriptor.flags)
space_width = character_widths.get(" ", 0)
if space_width == 0:
if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
space_width = character_widths["default"]
else:
space_width = character_widths["default"] // 2

space_width = cls._add_space_width(character_widths, font_descriptor.flags)

return cls(
name=name,
Expand All @@ -302,17 +357,132 @@ def from_font_resource(
interpretable=interpretable
)

@classmethod
def from_truetype_font_file(cls, font_file: BytesIO) -> "Font":
with TTFont(font_file) as tt_font_object:
header = tt_font_object["head"]
names = tt_font_object["name"]
postscript_info = tt_font_object["post"]
horizontal_header = tt_font_object["hhea"]
os_2 = tt_font_object["OS/2"]
metrics = tt_font_object["hmtx"].metrics

# Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
units_per_em = header.unitsPerEm
scale_factor = 1000.0 / units_per_em

# Get the font descriptor
font_descriptor_kwargs: dict[Any, Any] = {}
font_descriptor_kwargs["name"] = names.getDebugName(6) or names.getDebugName(1) # PostScript name
font_descriptor_kwargs["family"] = names.getDebugName(16) or names.getDebugName(1) # Prefer typographic
font_descriptor_kwargs["weight"] = names.getDebugName(17) or names.getDebugName(2) # names
font_descriptor_kwargs["ascent"] = int(round(horizontal_header.ascent * scale_factor, 0))
font_descriptor_kwargs["descent"] = int(round(horizontal_header.descent * scale_factor, 0))
font_descriptor_kwargs["cap_height"] = int(round(os_2.sCapHeight * scale_factor, 0))
font_descriptor_kwargs["x_height"] = int(round(os_2.sxHeight * scale_factor, 0))

# Get the font flags
flags: int = 0
italic_angle = postscript_info.italicAngle
if italic_angle != 0.0:
flags |= FontFlags.ITALIC
if postscript_info.isFixedPitch > 0:
flags |= FontFlags.FIXED_PITCH

# See Chapter 6 of the TrueType reference manual for the definition of the OS/2 table:
# https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
family_class = os_2.sFamilyClass >> 8
if 2 <= family_class <= 9 and family_class != 6:
flags |= FontFlags.SERIF
if family_class == 10:
flags |= FontFlags.SCRIPT
if family_class == 12:
flags |= FontFlags.SYMBOLIC
else:
flags |= FontFlags.NONSYMBOLIC
font_descriptor_kwargs["flags"] = flags

font_descriptor_kwargs["bbox"] = (
round(header.xMin * scale_factor, 0),
round(header.yMin * scale_factor, 0),
round(header.xMax * scale_factor, 0),
round(header.yMax * scale_factor, 0)
)

font_file_data = StreamObject()
font_file_raw_bytes = font_file.getvalue()
font_file_data.set_data(font_file_raw_bytes)
font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
font_descriptor_kwargs["font_file"] = font_file_data

font_descriptor = FontDescriptor(**font_descriptor_kwargs)
character_map = {chr(key): value for key, value in tt_font_object.getBestCmap().items()}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure that this is not correct. It accidentally works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what Claude AI says:

The Problem with /Identity CIDToGIDMap

The issue is on line 467 of the PR:

cid_font[NameObject("/CIDToGIDMap")] = NameObject("/Identity")

Why this causes garbled text:

When you set /CIDToGIDMap to just /Identity, the PDF reader assumes:

  • CID (Character ID) = Unicode codepoint (from your character_map)
  • GID (Glyph ID) = the same value

However, in a TrueType font file, the glyph IDs don't necessarily match Unicode codepoints. Looking at your code:

character_map = {chr(key): value for key, value in tt_font_object.getBestCmap().items()}

The character_map maps:

  • Keys: Unicode characters (from the font's cmap table)
  • Values: Glyph IDs in that font file

But when you later encode text using this map and then tell the PDF reader "use /Identity mapping," the reader will try to use the Unicode codepoint as the GID directly—not the glyph ID stored in your character_map. This causes mismatches where the wrong glyphs get rendered.

The Correct Solution

You need to create an explicit CIDToGIDMap stream that maps:

  • Input: Character ID (Unicode codepoint)
  • Output: Glyph ID (from the TrueType font)

Here's the approach:

# Build the CIDToGIDMap stream
cid_to_gid_map_bytes = bytearray()
for unicode_codepoint in sorted(character_map.keys(), key=ord):
    glyph_id = character_map[unicode_codepoint]
    # Encode as 2-byte big-endian (PDF standard for CIDToGIDMap)
    cid_to_gid_map_bytes.extend(glyph_id.to_bytes(2, byteorder='big'))

cid_to_gid_map_stream = StreamObject()
cid_to_gid_map_stream.set_data(bytes(cid_to_gid_map_bytes))
cid_font[NameObject("/CIDToGIDMap")] = cid_to_gid_map_stream

This ensures every character in your character_map has a corresponding, correct glyph ID lookup in the PDF.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably not correct either.

encoding = "utf_16_be" # Assume unicode

character_widths: dict[str, int] = {}
for character, glyph in character_map.items():
character_widths[character] = int(round(metrics[glyph][0] * scale_factor, 0))
cls._add_default_width(character_widths, flags)
space_width = cls._add_space_width(character_widths, flags)

return cls(
name=font_descriptor.name,
sub_type="TrueType",
encoding=encoding,
font_descriptor=font_descriptor,
character_map=character_map,
character_widths=character_widths,
space_width=space_width,
interpretable=True
)

def as_font_resource(self) -> DictionaryObject:
    """
    Build the PDF font resource dictionary for this font.

    Returns:
        A composite /Type0 font wrapping a single /CIDFontType2 descendant
        when the font descriptor carries an embedded font file and the
        sub-type is "TrueType". Otherwise a simple /Type1 resource, which
        only works with the 14 Adobe Core fonts.

    """
    # If we have an embedded TrueType font, we assume that we need to produce a Type 2 CID font resource.
    # The font file is compared against None because the stream object is dictionary-like and could be
    # falsy even though it is present.
    if self.font_descriptor.font_file is not None and self.sub_type == "TrueType":
        # Create the descendant font, using Identity mapping
        cid_font = DictionaryObject({
            NameObject("/Type"): NameObject("/Font"),
            NameObject("/Subtype"): NameObject("/CIDFontType2"),
            NameObject("/BaseFont"): NameObject(f"/{self.name}"),
            NameObject("/CIDSystemInfo"): DictionaryObject({
                NameObject("/Registry"): TextStringObject("Adobe"),  # Should be something read from font file
                NameObject("/Ordering"): TextStringObject("Identity"),
                NameObject("/Supplement"): NumberObject(0),
            }),
            # "/FontDescriptor" should be an IndirectObject.
            NameObject("/FontDescriptor"): self.font_descriptor.as_font_descriptor_resource(),
        })

        # Build the widths (/W) array. This can have two formats:
        # [first_cid [w1 w2 w3]] or [first last width]
        # Here we choose the first format and simply provide one array with one width for every cid.
        widths_list = []
        for char, width in self.character_widths.items():
            # Only single characters have a code point usable as a CID. This skips
            # the "default" entry and any other multi-character key, for which
            # ord() would raise a TypeError.
            if len(char) != 1:
                continue
            widths_list.extend([NumberObject(ord(char)), ArrayObject([NumberObject(width)])])

        cid_font[NameObject("/W")] = ArrayObject(widths_list)
        cid_font[NameObject("/DW")] = NumberObject(self.character_widths.get("default", 1000))
        # NOTE(review): /Identity declares CID == glyph index, but the CIDs written to /W above are
        # Unicode code points. Verify whether an explicit CIDToGIDMap stream is required instead.
        cid_font[NameObject("/CIDToGIDMap")] = NameObject("/Identity")

        # Create the Type 0 font object
        return DictionaryObject({
            NameObject("/Type"): NameObject("/Font"),
            NameObject("/Subtype"): NameObject("/Type0"),
            NameObject("/BaseFont"): NameObject(f"/{self.name}"),
            NameObject("/Encoding"): NameObject("/Identity-H"),
            NameObject("/DescendantFonts"): ArrayObject([cid_font]),
        })

    # Fallback: Return a font resource for the 14 Adobe Core fonts.
    return DictionaryObject({
        NameObject("/Type"): NameObject("/Font"),
        NameObject("/Subtype"): NameObject("/Type1"),
        NameObject("/Name"): NameObject(f"/{self.name}"),
        NameObject("/BaseFont"): NameObject(f"/{self.name}"),
        NameObject("/Encoding"): NameObject("/WinAnsiEncoding"),
    })

def text_width(self, text: str = "") -> float:
"""Sum of character widths specified in PDF font for the supplied text."""
Expand Down
20 changes: 20 additions & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,22 @@ def _add_apstream_object(
xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
self._merge_content_stream_to_page(page, xobject_drawing_commands)

def _make_font_descriptors_indirect(self, appearance_stream_object: StreamObject) -> None:
    """
    Register directly embedded font descriptors as indirect objects.

    Walks every entry of the appearance stream's /Resources /Font
    dictionary. For fonts with /DescendantFonts, the first descendant font
    is inspected instead of the font itself. A /FontDescriptor that is not
    yet an indirect reference is added to the writer and replaced in place
    by the value returned from ``_add_object``.

    Args:
        appearance_stream_object: Appearance stream whose /Resources
            contain a /Font dictionary.

    """
    resources = cast(DictionaryObject, appearance_stream_object["/Resources"])
    fonts = cast(DictionaryObject, resources["/Font"])

    for font_key in fonts:
        font_dict = cast(DictionaryObject, fonts[font_key])

        # Fonts with descendants keep the descriptor on the first descendant font.
        descriptor_owner = font_dict
        if "/DescendantFonts" in font_dict:
            descendants = cast(ArrayObject, font_dict["/DescendantFonts"])
            descriptor_owner = cast(DictionaryObject, descendants[0])

        if "/FontDescriptor" not in descriptor_owner:
            continue
        # raw_get is used so that an existing reference is seen as such.
        if isinstance(descriptor_owner.raw_get("/FontDescriptor"), IndirectObject):
            continue

        descriptor_owner[NameObject("/FontDescriptor")] = self._add_object(
            descriptor_owner["/FontDescriptor"]
        )

FFBITS_NUL = FA.FfBits(0)

def update_page_form_field_values(
Expand Down Expand Up @@ -1068,6 +1084,10 @@ def update_page_form_field_values(
annotation.get(FA.FT) == "/Sig"
): # deprecated # not implemented yet
logger_warning("Signature forms not implemented yet", __name__)

# Make font resources and font descriptors indirect objects
if appearance_stream_obj and "/Font" in cast(DictionaryObject, appearance_stream_obj["/Resources"]):
self._make_font_descriptors_indirect(appearance_stream_obj)
if flatten and appearance_stream_obj is not None:
self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])

Expand Down
19 changes: 15 additions & 4 deletions pypdf/generic/_appearance_stream.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import re
from dataclasses import dataclass
from enum import IntEnum
from io import BytesIO
from typing import Any, Optional, Union, cast

from .._codecs import fill_from_encoding
from .._codecs.core_font_metrics import CORE_FONT_METRICS
from .._font import Font
from .._font import HAS_FONTTOOLS, Font
from .._utils import logger_warning
from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
from ..generic import (
Expand Down Expand Up @@ -374,6 +375,16 @@ def __init__(
except UnicodeEncodeError:
encodable = False

if not encodable and font.font_descriptor.font_file and HAS_FONTTOOLS and font.sub_type == "TrueType":
# If we have a font file, we can try to produce a new font resource with an encoding
# that does include the necessary characters.
font = font.from_truetype_font_file(BytesIO(font.font_descriptor.font_file.get_data()))
font_resource = font.as_font_resource()
font_name = f"/{font.name}"
supported_chars = set(font.character_map.keys())
if all(char in supported_chars for char in text):
encodable = True

if not encodable:
logger_warning(
f"Text string '{text}' contains characters not supported by font encoding. "
Expand All @@ -384,9 +395,9 @@ def __init__(

font_glyph_byte_map: dict[str, bytes]
if isinstance(font.encoding, str):
font_glyph_byte_map = {
v: k.encode(font.encoding) for k, v in font.character_map.items()
}
font_glyph_byte_map = {}
for key, value in font.character_map.items():
font_glyph_byte_map[value] = key.encode(font.encoding)
else:
font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()}
font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cryptodome = ["PyCryptodome"]
image = ["Pillow>=8.0.0"]
full = [
"cryptography",
"fonttools",
"Pillow>=8.0.0"
]
dev = [
Expand Down
12 changes: 11 additions & 1 deletion tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,15 @@
from pypdf._codecs import charset_encoding
from pypdf._font import Font
from pypdf.errors import LimitReachedError
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject, StreamObject
from pypdf.generic import (
ArrayObject,
DictionaryObject,
EncodedStreamObject,
IndirectObject,
NameObject,
NullObject,
StreamObject,
)

from . import RESOURCE_ROOT, get_data_from_url

Expand Down Expand Up @@ -135,6 +143,8 @@ def test_iss1533():
reader.pages[0].extract_text() # no error
font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"])
assert font.character_map["\x01"] == "Ü"
assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)
assert font.font_descriptor.font_file["/Subtype"] == "/CIDFontType0C"


@pytest.mark.enable_socket
Expand Down
Loading