Merge pull request #42 from RudolfCardinal/extract-text-fixes

martinburchell · web-flow · commit cad5e2ffbb93 · 2026-01-26T22:18:40.000Z
Fix extraction of text from HTML files
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -25,6 +25,10 @@ jobs:
                   python-version: ${{ matrix.python-version }}
             - name: Change apt mirror
               run: ${GITHUB_WORKSPACE}/.github/scripts/change_apt_mirror.sh
+            - name: Install prerequisites
+              run: |
+                  set -euo pipefail
+                  sudo apt-get -y install libcairo2-dev pkg-config python3-dev
             - name: Create virtualenv
               run: ${GITHUB_WORKSPACE}/.github/scripts/create_virtualenv.sh
             - name: Install test Python packages
diff --git a/cardinal_pythonlib/extract_text.py b/cardinal_pythonlib/extract_text.py
@@ -1148,7 +1148,7 @@ def convert_html_to_text(
     # https://bugs.launchpad.net/beautifulsoup/+bug/2110492
     # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
     # So we just workaround this here:
-    if bytes is not None and len(blob) == 0:
+    if blob is not None and len(blob) == 0:
         return ""
 
     with get_filelikeobject(filename, blob) as fp:
diff --git a/cardinal_pythonlib/tests/extract_text_tests.py b/cardinal_pythonlib/tests/extract_text_tests.py
@@ -217,6 +217,26 @@ def test_htm_converted(self) -> None:
         )
         self.assertEqual(text.strip(), content)
 
+    def test_htm_file_converted(self) -> None:
+        content = self.fake.paragraph(nb_sentences=10)
+
+        html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+{content}
+</body>
+</html>
+"""
+        with NamedTemporaryFile(suffix=".htm", delete=False) as temp_file:
+            temp_file.write(html.encode("utf-8"))
+            temp_file.close()  # flush to disk; delete=False keeps the file
+            text = document_to_text(filename=temp_file.name)  # path, not blob
+
+        self.assertEqual(text.strip(), content)
+
     def test_empty_htm_converted(self) -> None:
         text = document_to_text(
             blob="".encode("utf-8"), extension="htm", config=self.config
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -914,3 +914,6 @@ Quick links:
 - ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
   (rather than raw file text), and this is now the default. Also, behind the
   scenes, exceptions in subprocesses are now reported.
+
+- Fix extraction of text from HTML files specified by filename (not blob) in
+  :func:`cardinal_pythonlib.extract_text.document_to_text`.