Skip to content

Commit dc92a17

Browse files
Replace illegal multibyte sequences when encoding emails
1 parent 499f994 commit dc92a17

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1299,7 +1299,7 @@ def _get_email_content(
12991299
content_type_header = message.get("Content-Type")
13001300
if content_type_header:
13011301
charset = content_type_header.params.get("charset", "utf-8")
1302-
blob = content.encode(charset)
1302+
blob = content.encode(charset, "replace")
13031303
elif isinstance(content, EmailMessage):
13041304
blob = content.as_bytes()
13051305
if message.get("Content-Transfer-Encoding") == "base64":

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,35 @@ def test_eml_with_empty_body_converted(self) -> None:
502502

503503
self.assertEqual("", text)
504504

505+
def test_eml_with_illegal_multibyte_sequence_replaced(self) -> None:
506+
content = """From: bar@example.org
507+
Subject: Illegal multibyte sequence
508+
To: foo@example.org
509+
Mime-Version: 1.0
510+
Content-Type: multipart/mixed;boundary="==="
511+
512+
--===
513+
Content-Type: text/html; charset="big5"
514+
Content-Transfer-Encoding: quoted-printable
515+
516+
<html><head>
517+
<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dbig5">
518+
</head>
519+
<body>
520+
=F9=F9
521+
</body>
522+
</html>
523+
--===--
524+
"""
525+
message = message_from_string(content, policy=policy.default)
526+
blob = message.as_bytes()
527+
528+
text = document_to_text(
529+
blob=blob, extension=".eml", config=self.config
530+
)
531+
532+
self.assertEqual(text.strip(), "??")
533+
505534
def test_unsupported_converted(self) -> None:
506535
with mock.patch.multiple(
507536
"cardinal_pythonlib.extract_text.subprocess",

0 commit comments

Comments
 (0)