Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: accept utf strings for metadata #2802

Merged
merged 5 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,23 +517,38 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
autodetect_pdfdocencoding: bool
autodetect_utf16: bool
utf16_bom: bytes
_original_bytes: Optional[bytes] = None

def __new__(cls, value: Any) -> "TextStringObject":
    """
    Create a text string from ``str`` or raw ``bytes`` input.

    ``bytes`` input is remembered in ``_original_bytes``. When it starts
    with a UTF-16 byte order mark it is decoded as UTF-16; a trailing
    part that cannot be decoded is dropped and a warning is logged.
    Any other input is kept as-is and flagged as PDFDocEncoding when it
    can be encoded that way, otherwise as UTF-16 (big endian BOM).

    Args:
        value: The text, or the raw bytes read for this string.

    Returns:
        The new instance with ``_original_bytes``, ``autodetect_utf16``,
        ``autodetect_pdfdocencoding`` and ``utf16_bom`` initialised.
    """
    org = None
    if isinstance(value, bytes):
        org = value
        # One character per byte, so the BOM can be looked for in the text.
        value = value.decode("charmap")
    o = str.__new__(cls, value)
    o._original_bytes = org
    o.autodetect_utf16 = False
    o.autodetect_pdfdocencoding = False
    o.utf16_bom = b""
    # Only raw bytes can carry a BOM: a ``str`` that merely starts with
    # the same two code points has nothing to decode and is handled by
    # the generic branch below instead of tripping over ``org is None``.
    if org is not None and value.startswith(("\xfe\xff", "\xff\xfe")):
        try:
            o = str.__new__(cls, org.decode("utf-16"))
        except UnicodeDecodeError as exc:
            logger_warning(
                f"{exc!s}\ninitial string:{exc.object!r}",
                __name__,
            )
            # Keep whatever could be decoded before the faulty position.
            o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
        # ``o`` has been rebuilt: every attribute set on the first
        # instance is gone and has to be assigned again.
        o._original_bytes = org
        o.autodetect_utf16 = True
        o.autodetect_pdfdocencoding = False
        o.utf16_bom = org[:2]
    else:
        try:
            encode_pdfdocencoding(o)
            o.autodetect_pdfdocencoding = True
        except UnicodeEncodeError:
            o.autodetect_utf16 = True
            o.utf16_bom = codecs.BOM_UTF16_BE
    return o

def clone(
Expand All @@ -544,6 +559,7 @@ def clone(
) -> "TextStringObject":
"""Clone object into pdf_dest."""
obj = TextStringObject(self)
obj._original_bytes = self._original_bytes
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
obj.autodetect_utf16 = self.autodetect_utf16
obj.utf16_bom = self.utf16_bom
Expand All @@ -559,7 +575,10 @@ def original_bytes(self) -> bytes:
if that occurs, this "original_bytes" property can be used to
back-calculate what the original encoded bytes were.
"""
return self.get_original_bytes()
if self._original_bytes is not None:
return self._original_bytes
else:
return self.get_original_bytes()

def get_original_bytes(self) -> bytes:
# We're a text string object, but the library is trying to get our raw
Expand All @@ -584,6 +603,8 @@ def get_encoded_bytes(self) -> bytes:
# nicer to look at in the PDF file. Sadly, we take a performance hit
# here for trying...
try:
if self._original_bytes is not None:
return self._original_bytes
if self.autodetect_utf16:
raise UnicodeEncodeError("", "forced", -1, -1, "")
bytearr = encode_pdfdocencoding(self)
Expand Down
38 changes: 28 additions & 10 deletions pypdf/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,27 +148,45 @@ def create_string_object(
out += forced_encoding[x]
except Exception:
out += bytes((x,)).decode("charmap")
return TextStringObject(out)
obj = TextStringObject(out)
obj._original_bytes = string
return obj
elif isinstance(forced_encoding, str):
if forced_encoding == "bytes":
return ByteStringObject(string)
return TextStringObject(string.decode(forced_encoding))
obj = TextStringObject(string.decode(forced_encoding))
obj._original_bytes = string
return obj
else:
try:
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
retval = TextStringObject(string.decode("utf-16"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = string[:2]
return retval
else:
# This is probably a big performance hit here, but we need
# to convert string objects into the text/unicode-aware
# version if possible... and the only way to check if that's
# possible is to try.
# Some strings are strings, some are just byte arrays.
retval = TextStringObject(decode_pdfdocencoding(string))
retval.autodetect_pdfdocencoding = True
if string.startswith(b"\x00"):
retval = TextStringObject(string.decode("utf-16be"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = codecs.BOM_UTF16_BE
return retval
if string[1:2] == b"\x00":
retval = TextStringObject(string.decode("utf-16le"))
retval._original_bytes = string
retval.autodetect_utf16 = True
retval.utf16_bom = codecs.BOM_UTF16_LE
return retval

# This is probably a big performance hit here, but we need
# to convert string objects into the text/unicode-aware
# version if possible... and the only way to check if that's
# possible is to try.
# Some strings are strings, some are just byte arrays.
retval = TextStringObject(decode_pdfdocencoding(string))
retval._original_bytes = string
retval.autodetect_pdfdocencoding = True
return retval
except UnicodeDecodeError:
return ByteStringObject(string)
else:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,9 @@ def test_textstringobject_autodetect_utf16():
tso.autodetect_utf16 = True
tso.utf16_bom = codecs.BOM_UTF16_BE
assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
tso.utf16_bom = codecs.BOM_UTF16_LE
assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00"
assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00"


def test_remove_child_not_in_tree():
Expand Down Expand Up @@ -1131,6 +1134,16 @@ def test_create_string_object_utf16_bom():
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
result = TextStringObject(
b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)
assert result == "PaperPort 14\x00"
assert result.autodetect_utf16 is True
assert result.utf16_bom == b"\xff\xfe"
assert (
result.get_encoded_bytes()
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
)

# utf16-be without bom
result = TextStringObject("ÿ")
Expand Down
21 changes: 21 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2333,3 +2333,24 @@ def test_set_need_appearances_writer():
"""Minimal test for coverage"""
writer = PdfWriter()
writer.set_need_appearances_writer()


def test_utf16_metadata():
    """Metadata outside PDFDocEncoding is written as UTF-16 (see #2754)."""
    subject = "Invoice №AI_047"
    expected_entry = (
        b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e"
        b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)"
    )

    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    writer.add_metadata({"/Subject": subject})

    stream = BytesIO()
    writer.write(stream)
    stream.seek(0)

    # The value must survive a write/read round trip...
    reader = PdfReader(stream)
    assert reader.metadata.subject == subject

    # ...and be serialised as an escaped UTF-16 string with its BOM.
    raw = stream.getvalue()
    start = raw.find(b"/Subject")
    assert raw[start : start + 100] == expected_entry
Loading