File tree Expand file tree Collapse file tree 4 files changed +28
-1
lines changed
Expand file tree Collapse file tree 4 files changed +28
-1
lines changed Original file line number Diff line number Diff line change 2525 python-version : ${{ matrix.python-version }}
2626 - name : Change apt mirror
2727 run : ${GITHUB_WORKSPACE}/.github/scripts/change_apt_mirror.sh
28+ - name : Install prerequisites
29+ run : |
30+ set -euo pipefail
31+ sudo apt-get -y install libcairo2-dev pkg-config python3-dev
2832 - name : Create virtualenv
2933 run : ${GITHUB_WORKSPACE}/.github/scripts/create_virtualenv.sh
3034 - name : Install test Python packages
Original file line number Diff line number Diff line change @@ -1148,7 +1148,7 @@ def convert_html_to_text(
11481148 # https://bugs.launchpad.net/beautifulsoup/+bug/2110492
11491149 # beautifulsoup4==4.13.4 returns "b''" for an empty bytes array
11501150 # So we just workaround this here:
1151- if bytes is not None and len (blob ) == 0 :
1151+ if blob is not None and len (blob ) == 0 :
11521152 return ""
11531153
11541154 with get_filelikeobject (filename , blob ) as fp :
Original file line number Diff line number Diff line change @@ -217,6 +217,26 @@ def test_htm_converted(self) -> None:
217217 )
218218 self .assertEqual (text .strip (), content )
219219
220+ def test_htm_file_converted (self ) -> None :
221+ content = self .fake .paragraph (nb_sentences = 10 )
222+
223+ html = f"""
224+ <!DOCTYPE html>
225+ <html>
226+ <head>
227+ </head>
228+ <body>
229+ { content }
230+ </body>
231+ </html>
232+ """
233+ with NamedTemporaryFile (suffix = ".htm" , delete = False ) as temp_file :
234+ temp_file .write (html .encode ("utf-8" ))
235+ temp_file .close ()
236+ text = document_to_text (filename = temp_file .name )
237+
238+ self .assertEqual (text .strip (), content )
239+
220240 def test_empty_htm_converted (self ) -> None :
221241 text = document_to_text (
222242 blob = "" .encode ("utf-8" ), extension = "htm" , config = self .config
Original file line number Diff line number Diff line change @@ -914,3 +914,6 @@ Quick links:
914914- ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
915915 (rather than raw file text), and this is now the default. Also, behind the
916916 scenes, exceptions in subprocesses are now reported.
917+
918+ - Fix extraction of text from HTML files in
919+ :func:`cardinal_pythonlib.extract_text.document_to_text`.
You can’t perform that action at this time.
0 commit comments