Skip to content

Commit

Permalink
ENH: Accept utf strings for metadata (#2802)
Browse files Browse the repository at this point in the history
Closes #2754.
  • Loading branch information
pubpub-zz authored Aug 16, 2024
1 parent 454a62a commit 0c81f3c
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 12 deletions.
25 changes: 23 additions & 2 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,23 +517,38 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
autodetect_pdfdocencoding: bool
autodetect_utf16: bool
utf16_bom: bytes
_original_bytes: Optional[bytes] = None

def __new__(cls, value: Any) -> "TextStringObject":
    """
    Create the string and record how it should be written back.

    Args:
        value: The text, expected to be ``str`` or ``bytes``. ``bytes`` are
            kept verbatim in ``_original_bytes``; they are decoded as
            UTF-16 when they start with a UTF-16 byte order mark and with
            "charmap" otherwise.

    Returns:
        The new object with ``_original_bytes``, ``autodetect_utf16``,
        ``autodetect_pdfdocencoding`` and ``utf16_bom`` all set.
    """
    org = value if isinstance(value, bytes) else None
    # Only raw bytes can carry a real BOM. A ``str`` that merely starts
    # with "\xfe\xff" / "\xff\xfe" has no bytes to decode, so it is handled
    # by the generic branch below instead of failing.
    if org is not None and org.startswith(
        (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)
    ):
        try:
            text = org.decode("utf-16")
        except UnicodeDecodeError as exc:
            logger_warning(
                f"{exc!s}\ninitial string:{exc.object!r}",
                __name__,
            )
            # Keep the part that decoded cleanly before the error.
            text = exc.object[: exc.start].decode("utf-16")
        o = str.__new__(cls, text)
        o.autodetect_utf16 = True
        # Always define the flag so later readers (e.g. clone) find it.
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = org[:2]
    else:
        o = str.__new__(cls, value if org is None else org.decode("charmap"))
        o.autodetect_utf16 = False
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = b""
        try:
            encode_pdfdocencoding(o)
        except UnicodeEncodeError:
            # Not representable in PDFDocEncoding: mark for UTF-16 BE.
            o.autodetect_utf16 = True
            o.utf16_bom = codecs.BOM_UTF16_BE
        else:
            o.autodetect_pdfdocencoding = True
    o._original_bytes = org
    return o

def clone(
Expand All @@ -544,6 +559,7 @@ def clone(
) -> "TextStringObject":
"""Clone object into pdf_dest."""
obj = TextStringObject(self)
obj._original_bytes = self._original_bytes
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
obj.autodetect_utf16 = self.autodetect_utf16
obj.utf16_bom = self.utf16_bom
Expand All @@ -559,7 +575,10 @@ def original_bytes(self) -> bytes:
if that occurs, this "original_bytes" property can be used to
back-calculate what the original encoded bytes were.
"""
return self.get_original_bytes()
if self._original_bytes is not None:
return self._original_bytes
else:
return self.get_original_bytes()

def get_original_bytes(self) -> bytes:
# We're a text string object, but the library is trying to get our raw
Expand All @@ -584,6 +603,8 @@ def get_encoded_bytes(self) -> bytes:
# nicer to look at in the PDF file. Sadly, we take a performance hit
# here for trying...
try:
if self._original_bytes is not None:
return self._original_bytes
if self.autodetect_utf16:
raise UnicodeEncodeError("", "forced", -1, -1, "")
bytearr = encode_pdfdocencoding(self)
Expand Down
38 changes: 28 additions & 10 deletions pypdf/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,27 +148,45 @@ def create_string_object(
out += forced_encoding[x]
except Exception:
out += bytes((x,)).decode("charmap")
return TextStringObject(out)
obj = TextStringObject(out)
obj._original_bytes = string
return obj
elif isinstance(forced_encoding, str):
if forced_encoding == "bytes":
return ByteStringObject(string)
return TextStringObject(string.decode(forced_encoding))
obj = TextStringObject(string.decode(forced_encoding))
obj._original_bytes = string
return obj
else:
try:
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
retval = TextStringObject(string.decode("utf-16"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = string[:2]
return retval
else:
# This is probably a big performance hit here, but we need
# to convert string objects into the text/unicode-aware
# version if possible... and the only way to check if that's
# possible is to try.
# Some strings are strings, some are just byte arrays.
retval = TextStringObject(decode_pdfdocencoding(string))
retval.autodetect_pdfdocencoding = True
if string.startswith(b"\x00"):
retval = TextStringObject(string.decode("utf-16be"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = codecs.BOM_UTF16_BE
return retval
if string[1:2] == b"\x00":
retval = TextStringObject(string.decode("utf-16le"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = codecs.BOM_UTF16_LE
return retval

# This is probably a big performance hit here, but we need
# to convert string objects into the text/unicode-aware
# version if possible... and the only way to check if that's
# possible is to try.
# Some strings are strings, some are just byte arrays.
retval = TextStringObject(decode_pdfdocencoding(string))
retval._original_bytes = string
retval.autodetect_pdfdocencoding = True
return retval
except UnicodeDecodeError:
return ByteStringObject(string)
else:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,9 @@ def test_textstringobject_autodetect_utf16():
tso.autodetect_utf16 = True
tso.utf16_bom = codecs.BOM_UTF16_BE
assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
tso.utf16_bom = codecs.BOM_UTF16_LE
assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00"
assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00"


def test_remove_child_not_in_tree():
Expand Down Expand Up @@ -1131,6 +1134,16 @@ def test_create_string_object_utf16_bom():
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
result = TextStringObject(
b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xff\xfe"
assert (
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)

# utf16-be without bom
result = TextStringObject("ÿ")
Expand Down
21 changes: 21 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2333,3 +2333,24 @@ def test_set_need_appearances_writer():
"""Minimal test for coverage"""
writer = PdfWriter()
writer.set_need_appearances_writer()


def test_utf16_metadata():
    """
    Check metadata containing "№" after a write/read cycle (see #2754).

    The subject must read back unchanged, and the raw output must hold it
    as an escaped string starting with the bytes FE FF.
    """
    subject = "Invoice №AI_047"
    expected = (
        b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e"
        b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)"
    )

    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    writer.add_metadata({"/Subject": subject})

    buffer = BytesIO()
    writer.write(buffer)
    buffer.seek(0)

    reader = PdfReader(buffer)
    assert reader.metadata.subject == subject

    raw = buffer.getvalue()
    start = raw.find(b"/Subject")
    # The expected entry is exactly 100 bytes long.
    assert raw[start : start + 100] == expected

0 comments on commit 0c81f3c

Please sign in to comment.