Skip to content

Commit

Permalink
Update test_tesseract.py
Browse files Browse the repository at this point in the history
Functionality cannot be tested in GitHub Actions because Tesseract is not installed there.

Correctly locate tessdata from installation folder

If tesseract-ocr on a Unix-like platform was not version 4.00, we used pathlib.Path incorrectly when locating tessdata.
This fix correctly determines the tessdata folder name for any version of tesseract-ocr.
Note, however, that we cannot guarantee that MuPDF's OCR code (which is based on version 4.00) can cope with tessdata content from a version other than 4.00.
  • Loading branch information
JorjMcKie committed Aug 16, 2024
1 parent eca7066 commit e247e96
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 19 deletions.
29 changes: 10 additions & 19 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18041,7 +18041,7 @@ def make_utf16be(s):
return "(" + r + ")"


def get_tessdata() -> str:
def get_tessdata():
"""Detect Tesseract-OCR and return its language support folder.

This function can be used to enable OCR via Tesseract even if the
Expand All @@ -18053,17 +18053,9 @@ def get_tessdata() -> str:
Folder name of tessdata if Tesseract-OCR is available, otherwise False.
"""
TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX")
if TESSDATA_PREFIX is not None:
if TESSDATA_PREFIX: # use environment variable if set
return TESSDATA_PREFIX

if sys.platform == "win32":
tessdata = "C:\\Program Files\\Tesseract-OCR\\tessdata"
else:
tessdata = "/usr/share/tesseract-ocr/4.00/tessdata"

if os.path.exists(tessdata):
return tessdata

"""
Try to locate the tesseract-ocr installation.
"""
Expand All @@ -18090,18 +18082,17 @@ def get_tessdata() -> str:
message("Tesseract-OCR is not installed")
return False

# determine tessdata via iteration over subfolders
tessdata = None
for sub_response in response.iterdir():
for sub_sub in sub_response.iterdir():
if str(sub_sub).endswith("tessdata"):
tessdata = sub_sub
break
if tessdata is not None:
# search tessdata in folder structure
dirname = response[1] # contains tesseract-ocr installation folder
if isinstance(dirname, bytes): # has probably 'bytes' format
dirname = dirname.decode()
dirname = pathlib.Path(dirname) # make a Path from it
sub = list(dirname.iterdir())[0] # includes the version sub-folder
tessdata = str(list(sub.iterdir())[0]) # this should be the tessdata
if tessdata.endswith("tessdata"): # expected name suffix
return tessdata
else:
message("unexpected: tesseract-ocr has no 'tessdata' folder")
return False
return False


Expand Down
1 change: 1 addition & 0 deletions tests/test_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,4 @@ def test_tesseract():
'dropping unclosed output'
)


0 comments on commit e247e96

Please sign in to comment.