2 files changed: +30 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -1299,7 +1299,7 @@ def _get_email_content(
12991299 content_type_header = message .get ("Content-Type" )
13001300 if content_type_header :
13011301 charset = content_type_header .params .get ("charset" , "utf-8" )
1302- blob = content .encode (charset )
1302+ blob = content .encode (charset , "replace" )
13031303 elif isinstance (content , EmailMessage ):
13041304 blob = content .as_bytes ()
13051305 if message .get ("Content-Transfer-Encoding" ) == "base64" :
Original file line number | Diff line number | Diff line change
@@ -502,6 +502,35 @@ def test_eml_with_empty_body_converted(self) -> None:
502502
503503 self .assertEqual ("" , text )
504504
505+ def test_eml_with_illegal_multibyte_sequence_replaced (self ) -> None :
506+ content = """From: bar@example.org
507+ Subject: Illegal multibyte sequence
508+ To: foo@example.org
509+ Mime-Version: 1.0
510+ Content-Type: multipart/mixed;boundary="==="
511+
512+ --===
513+ Content-Type: text/html; charset="big5"
514+ Content-Transfer-Encoding: quoted-printable
515+
516+ <html><head>
517+ <meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dbig5">
518+ </head>
519+ <body>
520+ =F9=F9
521+ </body>
522+ </html>
523+ --===--
524+ """
525+ message = message_from_string (content , policy = policy .default )
526+ blob = message .as_bytes ()
527+
528+ text = document_to_text (
529+ blob = blob , extension = ".eml" , config = self .config
530+ )
531+
532+ self .assertEqual (text .strip (), "??" )
533+
505534 def test_unsupported_converted (self ) -> None :
506535 with mock .patch .multiple (
507536 "cardinal_pythonlib.extract_text.subprocess" ,
You can’t perform that action at this time.
0 commit comments