Skip to content
16 changes: 15 additions & 1 deletion pypdf/_font.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from dataclasses import dataclass, field
from typing import Any, Optional, Union, cast

from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, StreamObject

from ._cmap import get_encoding
from ._codecs.adobe_glyphs import adobe_glyphs
from ._utils import logger_warning
from .errors import PdfReadError


@dataclass(frozen=True)
Expand All @@ -32,6 +33,7 @@ class FontDescriptor:
bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0))

character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500})
font_file: Union[StreamObject, None] = None

@staticmethod
def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]:
Expand Down Expand Up @@ -59,6 +61,18 @@ def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: Dic
bbox_tuple = tuple(map(float, font_kwargs["bbox"]))
assert len(bbox_tuple) == 4, bbox_tuple
font_kwargs["bbox"] = bbox_tuple
# Find the binary stream for this font if there is one
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Find the binary stream for this font if there is one
# Find the binary stream for this font if there is one

for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]:
if source_key in font_descriptor_dict:
if "font_file" in font_kwargs:
raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}")

try:
font_file = font_descriptor_dict[source_key].get_object()
font_kwargs["font_file"] = font_file
except PdfReadError as e:
logger_warning(f"Failed to get '{source_key}' in {font_descriptor_dict}: {e}", __name__)

return font_kwargs

@staticmethod
Expand Down
4 changes: 3 additions & 1 deletion tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pypdf._cmap import get_encoding, parse_bfchar
from pypdf._codecs import charset_encoding
from pypdf._font import Font
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject
from pypdf.generic import ArrayObject, DictionaryObject, EncodedStreamObject, IndirectObject, NameObject, NullObject

from . import get_data_from_url

Expand Down Expand Up @@ -139,6 +139,8 @@ def test_iss1533():
reader.pages[0].extract_text() # no error
font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"])
assert font.character_map["\x01"] == "Ü"
assert type(font.font_descriptor.font_file) is EncodedStreamObject
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert type(font.font_descriptor.font_file) is EncodedStreamObject
assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)

assert font.font_descriptor.font_file["/Subtype"] == "/CIDFontType0C"


@pytest.mark.enable_socket
Expand Down
39 changes: 37 additions & 2 deletions tests/test_font.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
"""Test font-related functionality."""
from pathlib import Path

from pypdf._font import FontDescriptor
from pypdf.generic import DictionaryObject, NameObject
import pytest

from pypdf import PdfReader
from pypdf._font import Font, FontDescriptor
from pypdf.errors import PdfReadError
from pypdf.generic import DictionaryObject, EncodedStreamObject, NameObject

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
RESOURCE_ROOT = PROJECT_ROOT / "resources"


def test_font_descriptor():
Expand All @@ -28,3 +37,29 @@ def test_font_descriptor():
assert my_font.italic_angle == 0
assert my_font.flags == 33
assert my_font.bbox == (-113.0, -250.0, 749.0, 801.0)


def test_font_file():
    """Check that an embedded font program is exposed as ``FontDescriptor.font_file``.

    Exercises the three font descriptor keys handled by the parser
    (``/FontFile``, ``/FontFile2`` and ``/FontFile3``) and the error raised
    when a single descriptor carries more than one of them.
    """
    reader = PdfReader(RESOURCE_ROOT / "multilang.pdf")

    # /FontFile
    font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F2"])
    assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)
    assert len(font.font_descriptor.font_file.get_data()) == 5116

    # /FontFile2
    font_resource = reader.pages[0]["/Resources"]["/Font"]["/F1"]
    font = Font.from_font_resource(font_resource)
    assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)
    assert len(font.font_descriptor.font_file.get_data()) == 28464

    # Add a second font file key to the same descriptor. The mutation is kept
    # outside the ``raises`` block so that only the parsing call is expected
    # to fail.
    font_resource[NameObject("/FontDescriptor")][NameObject("/FontFile")] = NameObject("xyz")
    with pytest.raises(PdfReadError, match=r"More than one /FontFile"):
        Font.from_font_resource(font_resource)

    # /FontFile3
    reader = PdfReader(RESOURCE_ROOT / "attachment.pdf")
    font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F1"])
    assert isinstance(font.font_descriptor.font_file, EncodedStreamObject)
    assert len(font.font_descriptor.font_file.get_data()) == 2168
1 change: 1 addition & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def test_font_class_to_dict():
"x_height": 500.0,
"italic_angle": 0.0,
"flags": 32,
"font_file": None,
"bbox": (
-100.0,
-200.0,
Expand Down