Skip to content

Commit f11f8c0

Browse files
blap and dolfim-ibm authored
feat: Add Tesseract PSM options support (#2411)
* feat: Add Tesseract PSM options support
  Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>
* apply formatting
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add tesseract_cli in checks
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Bruno Pio <913963+blap@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
1 parent ee55013 commit f11f8c0

File tree

4 files changed

+38
-5
lines changed

4 files changed

+38
-5
lines changed

docling/cli/main.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
PipelineOptions,
5858
ProcessingPipeline,
5959
TableFormerMode,
60+
TesseractCliOcrOptions,
61+
TesseractOcrOptions,
6062
VlmPipelineOptions,
6163
)
6264
from docling.datamodel.settings import settings
@@ -380,6 +382,13 @@ def convert( # noqa: C901
380382
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
381383
),
382384
] = None,
385+
psm: Annotated[
386+
Optional[int],
387+
typer.Option(
388+
...,
389+
help="Page Segmentation Mode for the OCR engine (0-13).",
390+
),
391+
] = None,
383392
pdf_backend: Annotated[
384393
PdfBackend, typer.Option(..., help="The PDF backend to use.")
385394
] = PdfBackend.DLPARSE_V2,
@@ -596,6 +605,10 @@ def convert( # noqa: C901
596605
ocr_lang_list = _split_list(ocr_lang)
597606
if ocr_lang_list is not None:
598607
ocr_options.lang = ocr_lang_list
608+
if psm is not None and isinstance(
609+
ocr_options, (TesseractOcrOptions, TesseractCliOcrOptions)
610+
):
611+
ocr_options.psm = psm
599612

600613
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
601614
# pipeline_options: PaginatedPipelineOptions

docling/datamodel/pipeline_options.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ class TesseractCliOcrOptions(OcrOptions):
154154
lang: List[str] = ["fra", "deu", "spa", "eng"]
155155
tesseract_cmd: str = "tesseract"
156156
path: Optional[str] = None
157+
psm: Optional[int] = (
158+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
159+
)
157160

158161
model_config = ConfigDict(
159162
extra="forbid",
@@ -166,6 +169,9 @@ class TesseractOcrOptions(OcrOptions):
166169
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
167170
lang: List[str] = ["fra", "deu", "spa", "eng"]
168171
path: Optional[str] = None
172+
psm: Optional[int] = (
173+
None # Page Segmentation Mode (0-13), defaults to tesseract's default
174+
)
169175

170176
model_config = ConfigDict(
171177
extra="forbid",

docling/models/tesseract_ocr_cli_model.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]):
117117
cmd.append("--tessdata-dir")
118118
cmd.append(self.options.path)
119119

120+
# Add PSM option if specified in the configuration
121+
if self.options.psm is not None:
122+
cmd.extend(["--psm", str(self.options.psm)])
123+
120124
cmd += [ifilename, "stdout", "tsv"]
121125
_log.info("command: {}".format(" ".join(cmd)))
122126

docling/models/tesseract_ocr_model.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ def __init__(
8686
self.script_prefix = ""
8787

8888
tesserocr_kwargs = {
89-
"psm": tesserocr.PSM.AUTO,
9089
"init": True,
9190
"oem": tesserocr.OEM.DEFAULT,
9291
}
@@ -96,14 +95,23 @@ def __init__(
9695
if self.options.path is not None:
9796
tesserocr_kwargs["path"] = self.options.path
9897

98+
# Set main OCR reader with configurable PSM
99+
main_psm = (
100+
tesserocr.PSM(self.options.psm)
101+
if self.options.psm is not None
102+
else tesserocr.PSM.AUTO
103+
)
99104
if lang == "auto":
100-
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
105+
self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
101106
else:
102107
self.reader = tesserocr.PyTessBaseAPI(
103-
**{"lang": lang} | tesserocr_kwargs,
108+
lang=lang,
109+
psm=main_psm,
110+
**tesserocr_kwargs,
104111
)
112+
# OSD reader must use PSM.OSD_ONLY for orientation detection
105113
self.osd_reader = tesserocr.PyTessBaseAPI(
106-
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
114+
lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
107115
)
108116
self.reader_RIL = tesserocr.RIL
109117

@@ -187,7 +195,9 @@ def __call__(
187195
tesserocr.PyTessBaseAPI(
188196
path=self.reader.GetDatapath(),
189197
lang=lang,
190-
psm=tesserocr.PSM.AUTO,
198+
psm=tesserocr.PSM(self.options.psm)
199+
if self.options.psm is not None
200+
else tesserocr.PSM.AUTO,
191201
init=True,
192202
oem=tesserocr.OEM.DEFAULT,
193203
)

0 commit comments

Comments
 (0)