Skip to content

Commit de72344

Browse files
Allow blobs to be empty when extracting text
It is possible to have an email with an empty body. Other scenarios (empty HTML, docx, etc.) are pretty unlikely.
1 parent 5fb204f commit de72344

File tree

2 files changed

+29
-4
lines changed

2 files changed

+29
-4
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ def get_filelikeobject(filename: str = None, blob: bytes = None) -> BinaryIO:
358358
Returns:
359359
a :class:`BinaryIO` object
360360
"""
361-
if not filename and not blob:
361+
if filename is None and blob is None:
362362
raise ValueError("no filename and no blob")
363363
if filename and blob:
364364
raise ValueError("specify either filename or blob")
@@ -373,11 +373,11 @@ def get_file_contents(filename: str = None, blob: bytes = None) -> bytes:
373373
"""
374374
Returns the binary contents of a file, or of a BLOB.
375375
"""
376-
if not filename and not blob:
376+
if filename is None and blob is None:
377377
raise ValueError("no filename and no blob")
378378
if filename and blob:
379379
raise ValueError("specify either filename or blob")
380-
if blob:
380+
if blob is not None:
381381
return blob
382382
with open(filename, "rb") as f:
383383
return f.read()
@@ -1408,7 +1408,7 @@ def document_to_text(
14081408
Raises an exception for malformed arguments, missing files, bad
14091409
filetypes, etc.
14101410
"""
1411-
if not filename and not blob:
1411+
if filename is None and blob is None:
14121412
raise ValueError("document_to_text: no filename and no blob")
14131413
if filename and blob:
14141414
raise ValueError("document_to_text: specify either filename or blob")

cardinal_pythonlib/tests/extract_text_tests.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,12 @@ def test_htm_converted(self) -> None:
204204
)
205205
self.assertEqual(text.strip(), content)
206206

207+
def test_empty_htm_converted(self) -> None:
208+
text = document_to_text(
209+
blob="".encode("utf-8"), extension="htm", config=self.config
210+
)
211+
self.assertEqual(text, "")
212+
207213
def test_log_converted(self) -> None:
208214
content = """
209215
2025-04-02 06:05:43,772 INFO Starting unattended upgrades script
@@ -477,6 +483,25 @@ def test_eml_with_no_content_type_converted(self) -> None:
477483

478484
self.assertIn(text_content, text)
479485

486+
def test_eml_with_empty_body_converted(self) -> None:
487+
content = """From: bar@example.org
488+
Subject: No body
489+
To: foo@example.org
490+
Mime-Version: 1.0
491+
Content-Type: multipart/mixed;boundary="==="
492+
493+
--===
494+
--===--
495+
"""
496+
message = message_from_string(content, policy=policy.default)
497+
blob = message.as_bytes()
498+
499+
text = document_to_text(
500+
blob=blob, extension=".eml", config=self.config
501+
)
502+
503+
self.assertEqual("", text)
504+
480505
def test_unsupported_converted(self) -> None:
481506
with mock.patch.multiple(
482507
"cardinal_pythonlib.extract_text.subprocess",

0 commit comments

Comments
 (0)