From e247e96d2d746912a1e8cac94c73a6f2e1d458cd Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sun, 11 Aug 2024 07:52:13 -0400 Subject: [PATCH] Update test_tesseract.py Functionality cannot be tested in GitHub Actions because no Tesseract is installed. Correctly locate tessdata from installation folder If tesseract-ocr is not version 4.00 in a Unix-like platform, we incorrectly used pathlib.Path for locating tessdata. This fix correctly determines the tessdata folder name for any version of tesseract-ocr. Note however, that we cannot guarantee that MuPDF's OCR code (which is on version 4.00) can cope with tessdata content contained in a different version than 4.00. --- src/__init__.py | 29 ++++++++++------------------- tests/test_tesseract.py | 1 + 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index f715c072d..82d3b9908 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -18041,7 +18041,7 @@ def make_utf16be(s): return "(" + r + ")" -def get_tessdata() -> str: +def get_tessdata(): """Detect Tesseract-OCR and return its language support folder. This function can be used to enable OCR via Tesseract even if the @@ -18053,17 +18053,9 @@ def get_tessdata() -> str: Folder name of tessdata if Tesseract-OCR is available, otherwise False. """ TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX") - if TESSDATA_PREFIX is not None: + if TESSDATA_PREFIX: # use environment variable if set return TESSDATA_PREFIX - if sys.platform == "win32": - tessdata = "C:\\Program Files\\Tesseract-OCR\\tessdata" - else: - tessdata = "/usr/share/tesseract-ocr/4.00/tessdata" - - if os.path.exists(tessdata): - return tessdata - """ Try to locate the tesseract-ocr installation. 
""" @@ -18090,18 +18082,17 @@ def get_tessdata() -> str: message("Tesseract-OCR is not installed") return False - # determine tessdata via iteration over subfolders - tessdata = None - for sub_response in response.iterdir(): - for sub_sub in sub_response.iterdir(): - if str(sub_sub).endswith("tessdata"): - tessdata = sub_sub - break - if tessdata is not None: + # search tessdata in folder structure + dirname = response[1] # contains tesseract-ocr installation folder + if isinstance(dirname, bytes): # has probably 'bytes' format + dirname = dirname.decode() + dirname = pathlib.Path(dirname) # make a Path from it + sub = list(dirname.iterdir())[0] # includes the version sub-folder + tessdata = str(list(sub.iterdir())[0]) # this should be the tessdata + if tessdata.endswith("tessdata"): # expected name suffix return tessdata else: message("unexpected: tesseract-ocr has no 'tessdata' folder") - return False return False diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index f8b08d2c7..92212b98f 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -65,3 +65,4 @@ def test_tesseract(): 'dropping unclosed output' ) +