Work around BeautifulSoup not handling an empty byte array correctly

martinburchell · martinburchell · commit bdc9983e1cb1 · 2025-05-12T16:24:52.000+01:00
diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
@@ -1140,6 +1140,12 @@ def convert_html_to_text(
     """
     Converts HTML to text.
     """
+
+    # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
+    # So we just workaround this here:
+    if bytes is not None and len(blob) == 0:
+        return ""
+
     with get_filelikeobject(filename, blob) as fp:
         soup = bs4.BeautifulSoup(fp, "html.parser")
         return soup.get_text()