py-pdf · PJBrs · Jan 16, 2026 · Feb 20, 2026 · Feb 21, 2026 · Feb 20, 2026
diff --git a/pypdf/_font.py b/pypdf/_font.py
@@ -1,13 +1,29 @@
 from collections.abc import Sequence
 from dataclasses import dataclass, field
+from io import BytesIO
 from typing import Any, Union, cast
 
-from pypdf.generic import ArrayObject, DictionaryObject, NameObject
+from pypdf.generic import (
+    ArrayObject,
+    DictionaryObject,
+    FloatObject,
+    NameObject,
+    NumberObject,
+    StreamObject,
+    TextStringObject,
+)
 
 from ._cmap import get_encoding
 from ._codecs.adobe_glyphs import adobe_glyphs
 from ._utils import logger_warning
 from .constants import FontFlags
+from .errors import PdfReadError
+
+try:
+    from fontTools.ttLib import TTFont
+    HAS_FONTTOOLS = True
+except ImportError:
+    HAS_FONTTOOLS = False
 
 
 @dataclass(frozen=True)
@@ -31,6 +47,26 @@ class FontDescriptor:
     italic_angle: float = 0.0  # Non-italic
     flags: int = 32  # Non-serif, non-symbolic, not fixed width
     bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))
+    font_file: Union[StreamObject, None] = None
+
+    def as_font_descriptor_resource(self) -> DictionaryObject:
+        """
+        Build the PDF ``/FontDescriptor`` dictionary for this descriptor.
+
+        Returns:
+            A dictionary with the font name, flags, bounding box, italic angle
+            and vertical metrics. If a font program is attached, it is added
+            under ``/FontFile``, ``/FontFile2`` or ``/FontFile3``, chosen from
+            the entries of the font file stream.
+
+        """
+        font_descriptor_resource = DictionaryObject({
+            NameObject("/Type"): NameObject("/FontDescriptor"),
+            NameObject("/FontName"): NameObject(f"/{self.name}"),
+            NameObject("/Flags"): NumberObject(self.flags),
+            NameObject("/FontBBox"): ArrayObject([FloatObject(n) for n in self.bbox]),
+            NameObject("/ItalicAngle"): FloatObject(self.italic_angle),
+            NameObject("/Ascent"): FloatObject(self.ascent),
+            NameObject("/Descent"): FloatObject(self.descent),
+            NameObject("/CapHeight"): FloatObject(self.cap_height),
+            NameObject("/XHeight"): FloatObject(self.x_height),
+        })
+
+        # Compare against None explicitly: the stream is dictionary-like, so a plain
+        # truthiness test could treat a stream without dictionary entries as absent.
+        if self.font_file is not None:
+            # Pick the key matching the kind of font program (PDF 32000-1, embedded font programs):
+            # /FontFile3 streams carry a /Subtype, Type 1 programs (/FontFile) carry /Length2,
+            # and everything else is treated as TrueType (/FontFile2), as before.
+            if "/Subtype" in self.font_file:
+                font_file_key = "/FontFile3"
+            elif "/Length2" in self.font_file:
+                font_file_key = "/FontFile"
+            else:
+                font_file_key = "/FontFile2"
+            font_descriptor_resource[NameObject(font_file_key)] = self.font_file
+
+        return font_descriptor_resource
 
 
 @dataclass(frozen=True)
@@ -192,6 +228,17 @@ def _add_default_width(current_widths: dict[str, int], flags: int) -> None:
         valid_widths = [w for w in current_widths.values() if w > 0]
         current_widths["default"] = sum(valid_widths) // len(valid_widths) if valid_widths else 500
 
+    @staticmethod
+    def _add_space_width(character_widths: dict[str, int], flags: int) -> int:
+        """
+        Determine the width of the space character.
+
+        Args:
+            character_widths: Widths per character; must contain a "default"
+                entry whenever no non-zero width for " " is present.
+            flags: Font descriptor flags.
+
+        Returns:
+            The non-zero width stored for " " if there is one. Otherwise the
+            default width for fixed pitch fonts, and half the default width
+            (integer division) for all other fonts.
+
+        """
+        explicit_width = character_widths.get(" ", 0)
+        if explicit_width != 0:
+            return explicit_width
+
+        # No usable space width available: derive one from the default width.
+        default_width = character_widths["default"]
+        is_fixed_pitch = (flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH
+        return default_width if is_fixed_pitch else default_width // 2
+
     @staticmethod
     def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
         font_descriptor_kwargs: dict[Any, Any] = {}
@@ -214,6 +261,18 @@ def _parse_font_descriptor(font_descriptor_obj: DictionaryObject) -> dict[str, A
             bbox_tuple = tuple(map(float, font_descriptor_kwargs["bbox"]))
             assert len(bbox_tuple) == 4, bbox_tuple
             font_descriptor_kwargs["bbox"] = bbox_tuple
+
+        # Find the binary stream for this font if there is one
+        for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
+            if source_key in font_descriptor_obj:
+                if "font_file" in font_descriptor_kwargs:
+                    raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")
+
+                try:
+                    font_file = font_descriptor_obj[source_key].get_object()
+                    font_descriptor_kwargs["font_file"] = font_file
+                except PdfReadError as e:
+                    logger_warning(f"Failed to get '{source_key}' in {font_descriptor_obj}: {e}", __name__)
         return font_descriptor_kwargs
 
     @classmethod
@@ -284,12 +343,8 @@ def from_font_resource(
 
         if character_widths.get("default", 0) == 0:
             cls._add_default_width(character_widths, font_descriptor.flags)
-        space_width = character_widths.get(" ", 0)
-        if space_width == 0:
-            if (font_descriptor.flags & FontFlags.FIXED_PITCH) == FontFlags.FIXED_PITCH:
-                space_width = character_widths["default"]
-            else:
-                space_width = character_widths["default"] // 2
+
+        space_width = cls._add_space_width(character_widths, font_descriptor.flags)
 
         return cls(
             name=name,
@@ -302,17 +357,132 @@ def from_font_resource(
             interpretable=interpretable
         )
 
+    @classmethod
+    def from_truetype_font_file(cls, font_file: BytesIO) -> "Font":
+        """
+        Create a font from an in-memory TrueType font file.
+
+        All metrics are converted from the font's own units per em to the
+        1000 units per em used by PDF. The complete font program is attached
+        to the font descriptor as a stream.
+
+        Args:
+            font_file: The binary TrueType font program.
+
+        Returns:
+            A font with sub type "TrueType", "utf_16_be" as encoding and a
+            character map from Unicode characters to glyph names.
+
+        Raises:
+            ImportError: If fontTools is not installed.
+
+        """
+        if not HAS_FONTTOOLS:
+            # Without this check, the missing import would surface as a NameError on TTFont.
+            raise ImportError("fontTools is required to read TrueType font files")
+
+        with TTFont(font_file) as tt_font_object:
+            header = tt_font_object["head"]
+            names = tt_font_object["name"]
+            postscript_info = tt_font_object["post"]
+            horizontal_header = tt_font_object["hhea"]
+            # Not every font ships an OS/2 table; ``get`` returns None instead of raising.
+            os_2 = tt_font_object.get("OS/2")
+            metrics = tt_font_object["hmtx"].metrics
+
+            # Get the scaling factor to convert font file's units per em to PDF's 1000 units per em
+            units_per_em = header.unitsPerEm
+            scale_factor = 1000.0 / units_per_em
+
+            # Get the font descriptor
+            ascent = round(horizontal_header.ascent * scale_factor)
+            # sCapHeight and sxHeight only exist in version 2 and later of the OS/2 table.
+            raw_cap_height = getattr(os_2, "sCapHeight", None)
+            raw_x_height = getattr(os_2, "sxHeight", None)
+            italic_angle = postscript_info.italicAngle
+
+            font_descriptor_kwargs: dict[Any, Any] = {}
+            font_descriptor_kwargs["name"] = names.getDebugName(6) or names.getDebugName(1)  # PostScript name
+            font_descriptor_kwargs["family"] = names.getDebugName(16) or names.getDebugName(1)  # Prefer typographic
+            font_descriptor_kwargs["weight"] = names.getDebugName(17) or names.getDebugName(2)  # names
+            font_descriptor_kwargs["ascent"] = ascent
+            font_descriptor_kwargs["descent"] = round(horizontal_header.descent * scale_factor)
+            # Without a cap height in the font, fall back to the ascent as an approximation.
+            font_descriptor_kwargs["cap_height"] = (
+                round(raw_cap_height * scale_factor) if raw_cap_height is not None else ascent
+            )
+            # Without an x-height in the font, use 0.
+            font_descriptor_kwargs["x_height"] = (
+                round(raw_x_height * scale_factor) if raw_x_height is not None else 0
+            )
+            # Keep the descriptor's italic angle in line with the ITALIC flag set below.
+            font_descriptor_kwargs["italic_angle"] = italic_angle
+
+            # Get the font flags
+            flags: int = 0
+            if italic_angle != 0.0:
+                flags |= FontFlags.ITALIC
+            if postscript_info.isFixedPitch > 0:
+                flags |= FontFlags.FIXED_PITCH
+
+            # See Chapter 6 of the TrueType reference manual for the definition of the OS/2 table:
+            # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6OS2.html
+            # The class ID is stored in the high byte; 0 ("no classification") is used without an OS/2 table.
+            family_class = getattr(os_2, "sFamilyClass", 0) >> 8
+            if 2 <= family_class <= 9 and family_class != 6:
+                flags |= FontFlags.SERIF
+            if family_class == 10:
+                flags |= FontFlags.SCRIPT
+            if family_class == 12:
+                flags |= FontFlags.SYMBOLIC
+            else:
+                flags |= FontFlags.NONSYMBOLIC
+            font_descriptor_kwargs["flags"] = flags
+
+            font_descriptor_kwargs["bbox"] = (
+                round(header.xMin * scale_factor, 0),
+                round(header.yMin * scale_factor, 0),
+                round(header.xMax * scale_factor, 0),
+                round(header.yMax * scale_factor, 0)
+            )
+
+            # Embed the unmodified font program; /Length1 holds its length in bytes.
+            font_file_data = StreamObject()
+            font_file_raw_bytes = font_file.getvalue()
+            font_file_data.set_data(font_file_raw_bytes)
+            font_file_data.update({NameObject("/Length1"): NumberObject(len(font_file_raw_bytes))})
+            font_descriptor_kwargs["font_file"] = font_file_data
+
+            font_descriptor = FontDescriptor(**font_descriptor_kwargs)
+            # getBestCmap() returns None if the font has no Unicode character map.
+            best_cmap = tt_font_object.getBestCmap() or {}
+            character_map = {chr(key): value for key, value in best_cmap.items()}
+            encoding = "utf_16_be"  # Assume unicode
+
+            character_widths: dict[str, int] = {
+                character: round(metrics[glyph][0] * scale_factor)
+                for character, glyph in character_map.items()
+            }
+            cls._add_default_width(character_widths, flags)
+            space_width = cls._add_space_width(character_widths, flags)
+
+        return cls(
+            name=font_descriptor.name,
+            sub_type="TrueType",
+            encoding=encoding,
+            font_descriptor=font_descriptor,
+            character_map=character_map,
+            character_widths=character_widths,
+            space_width=space_width,
+            interpretable=True
+        )
+
     def as_font_resource(self) -> DictionaryObject:
-        # For now, this returns a font resource that only works with the 14 Adobe Core fonts.
-        return (
-            DictionaryObject({
-                NameObject("/Subtype"): NameObject("/Type1"),
-                NameObject("/Name"): NameObject(f"/{self.name}"),
+        """
+        Build a PDF font resource dictionary for this font.
+
+        Returns:
+            For a TrueType font with an attached font file, a ``/Type0`` font
+            with one ``/CIDFontType2`` descendant font. Otherwise, a ``/Type1``
+            font resource with ``/WinAnsiEncoding``.
+
+        """
+        # If we have an embedded TrueType font, we assume that we need to produce a Type 2 CID font resource.
+        # Compare against None explicitly: the font file stream is dictionary-like, so a plain
+        # truthiness test could treat a stream without dictionary entries as absent.
+        if self.font_descriptor.font_file is not None and self.sub_type == "TrueType":
+            # Create the descendant font, using Identity mapping
+            # NOTE(review): the CIDs written below are Unicode code points, while "/CIDToGIDMap /Identity"
+            # presumably makes a viewer use them directly as glyph indices - confirm against the PDF
+            # specification whether an explicit CID-to-GID map stream is needed.
+            cid_font = DictionaryObject({
                 NameObject("/Type"): NameObject("/Font"),
+                NameObject("/Subtype"): NameObject("/CIDFontType2"),
                 NameObject("/BaseFont"): NameObject(f"/{self.name}"),
-                NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
+                NameObject("/CIDSystemInfo"): DictionaryObject({
+                    NameObject("/Registry"): TextStringObject("Adobe"),  # Should be something read from font file
+                    NameObject("/Ordering"): TextStringObject("Identity"),
+                    NameObject("/Supplement"): NumberObject(0)
+                }),
+                # "/FontDescriptor" should be an IndirectObject.
+                NameObject("/FontDescriptor"): self.font_descriptor.as_font_descriptor_resource()
             })
-        )
+
+            # Build the widths (/W) array. This can have two formats:
+            # [first_cid [w1 w2 w3]] or [first last width]
+            # Here we choose the first format and simply provide one array with one width for every cid.
+            widths_list = []
+            for char, width in self.character_widths.items():
+                # Only single characters can be converted with ord(). This skips the "default"
+                # entry as well as any other multi-character key.
+                if len(char) != 1:
+                    continue
+                cid = ord(char)
+                widths_list.extend([NumberObject(cid), ArrayObject([NumberObject(width)])])
+
+            cid_font[NameObject("/W")] = ArrayObject(widths_list)
+            cid_font[NameObject("/DW")] = NumberObject(self.character_widths.get("default", 1000))
+            cid_font[NameObject("/CIDToGIDMap")] = NameObject("/Identity")
+
+            # Create the Type 0 font object
+            return DictionaryObject({
+                NameObject("/Type"): NameObject("/Font"),
+                NameObject("/Subtype"): NameObject("/Type0"),
+                NameObject("/BaseFont"): NameObject(f"/{self.name}"),
+                NameObject("/Encoding"): NameObject("/Identity-H"),
+                NameObject("/DescendantFonts"): ArrayObject([cid_font]),
+            })
+
+        # Fallback: Return a font resource for the 14 Adobe Core fonts.
+        return DictionaryObject({
+            NameObject("/Type"): NameObject("/Font"),
+            NameObject("/Subtype"): NameObject("/Type1"),
+            NameObject("/Name"): NameObject(f"/{self.name}"),
+            NameObject("/BaseFont"): NameObject(f"/{self.name}"),
+            NameObject("/Encoding"): NameObject("/WinAnsiEncoding")
+        })
 
     def text_width(self, text: str = "") -> float:
         """Sum of character widths specified in PDF font for the supplied text."""

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -928,6 +928,22 @@ def _add_apstream_object(
         xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode()
         self._merge_content_stream_to_page(page, xobject_drawing_commands)
 
+    def _make_font_descriptors_indirect(self, appearance_stream_object: StreamObject) -> None:
+        """
+        Turn inline font descriptors of an appearance stream into indirect objects.
+
+        For every font in the ``/Font`` resources of the appearance stream, the
+        ``/FontDescriptor`` entry (taken from the first descendant font if the
+        font has ``/DescendantFonts``) is added to this writer and replaced by
+        the reference returned from ``_add_object``, unless it already is an
+        indirect object.
+
+        Args:
+            appearance_stream_object: The appearance stream whose font
+                resources are updated in place.
+
+        """
+        # Nothing to do for appearance streams without font resources.
+        if "/Resources" not in appearance_stream_object:
+            return
+        resources = cast(DictionaryObject, appearance_stream_object["/Resources"])
+        if "/Font" not in resources:
+            return
+        font_resources = cast(DictionaryObject, resources["/Font"])
+
+        for font_resource in font_resources:
+            font_resource_object = cast(DictionaryObject, font_resources[font_resource])
+            if "/DescendantFonts" in font_resource_object:
+                descendant_fonts = cast(ArrayObject, font_resource_object["/DescendantFonts"])
+                if not descendant_fonts:
+                    continue
+                # The array entry may itself be an indirect reference; resolve it so that
+                # the item assignment below acts on the font dictionary.
+                font_resource_dict = cast(DictionaryObject, descendant_fonts[0].get_object())
+            else:
+                font_resource_dict = font_resource_object
+            if "/FontDescriptor" in font_resource_dict and not isinstance(
+                font_resource_dict.raw_get("/FontDescriptor"), IndirectObject
+            ):
+                font_resource_dict[NameObject("/FontDescriptor")] = self._add_object(
+                    font_resource_dict["/FontDescriptor"]
+                )
+
     FFBITS_NUL = FA.FfBits(0)
 
     def update_page_form_field_values(
@@ -1068,6 +1084,10 @@ def update_page_form_field_values(
                     annotation.get(FA.FT) == "/Sig"
                 ):  # deprecated  # not implemented yet
                     logger_warning("Signature forms not implemented yet", __name__)
+
+                # Make font resources and font descriptors indirect objects
+                if appearance_stream_obj and "/Font" in cast(DictionaryObject, appearance_stream_obj["/Resources"]):
+                    self._make_font_descriptors_indirect(appearance_stream_obj)
                 if flatten and appearance_stream_obj is not None:
                     self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1])
 

diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py
@@ -1,11 +1,12 @@
 import re
 from dataclasses import dataclass
 from enum import IntEnum
+from io import BytesIO
 from typing import Any, Optional, Union, cast
 
 from .._codecs import fill_from_encoding
 from .._codecs.core_font_metrics import CORE_FONT_METRICS
-from .._font import Font
+from .._font import HAS_FONTTOOLS, Font
 from .._utils import logger_warning
 from ..constants import AnnotationDictionaryAttributes, BorderStyles, FieldDictionaryAttributes
 from ..generic import (
@@ -374,6 +375,16 @@ def __init__(
         except UnicodeEncodeError:
             encodable = False
 
+        if not encodable and font.font_descriptor.font_file and HAS_FONTTOOLS and font.sub_type == "TrueType":
+            # If we have a font file, we can try to produce a new font resource with an encoding
+            # that does include the necessary characters.
+            font = font.from_truetype_font_file(BytesIO(font.font_descriptor.font_file.get_data()))
+            font_resource = font.as_font_resource()
+            font_name = f"/{font.name}"
+            supported_chars = set(font.character_map.keys())
+            if all(char in supported_chars for char in text):
+                encodable = True
+
         if not encodable:
             logger_warning(
                 f"Text string '{text}' contains characters not supported by font encoding. "
@@ -384,9 +395,9 @@ def __init__(
 
         font_glyph_byte_map: dict[str, bytes]
         if isinstance(font.encoding, str):
-            font_glyph_byte_map = {
-                v: k.encode(font.encoding) for k, v in font.character_map.items()
-            }
+            font_glyph_byte_map = {}
+            for key, value in font.character_map.items():
+                font_glyph_byte_map[value] = key.encode(font.encoding)
         else:
             font_glyph_byte_map = {v: bytes((k,)) for k, v in font.encoding.items()}
             font_encoding_rev = {v: bytes((k,)) for k, v in font.encoding.items()}

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,6 +44,7 @@ cryptodome = ["PyCryptodome"]
 image = ["Pillow>=8.0.0"]
 full = [
     "cryptography",
+    "fonttools",
     "Pillow>=8.0.0"
 ]
 dev = [

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -8,7 +8,15 @@
 from pypdf._codecs import charset_encoding
 from pypdf._font import Font
 from pypdf.errors import LimitReachedError
-from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject, StreamObject
+from pypdf.generic import (
+    ArrayObject,
+    DictionaryObject,
+    EncodedStreamObject,
+    IndirectObject,
+    NameObject,
+    NullObject,
+    StreamObject,
+)
 
 from . import RESOURCE_ROOT, get_data_from_url
 
@@ -135,6 +143,8 @@ def test_iss1533():
     reader.pages[0].extract_text()  # no error
     font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"])
     assert font.character_map["\x01"] == "Ü"
+    assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)
+    assert font.font_descriptor.font_file["/Subtype"] == "/CIDFontType0C"
 
 
 @pytest.mark.enable_socket