Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,8 @@ def get_prediction_provider(
docling_layout_model_spec: Optional[LayoutModelConfig] = None,
docling_layout_create_orphan_clusters: Optional[bool] = None,
docling_layout_keep_empty_clusters: Optional[bool] = None,
# Orphan-cluster toggle for the programmatic Docling PDF pipeline (PDF_DOCLING) only; None leaves the layout options untouched
docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
docling_force_full_page_ocr: Optional[bool] = None,
):
pipeline_options: PaginatedPipelineOptions
Expand Down Expand Up @@ -431,6 +433,14 @@ def get_prediction_provider(
pdf_pipeline_options.generate_page_images = True
pdf_pipeline_options.generate_picture_images = True

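# Programmatic Docling pipeline (PDF_DOCLING) only: when the flag is provided,
# replace the pipeline's layout options with a fresh LayoutOptions whose
# create_orphan_clusters mirrors the flag.
# NOTE(review): the CLI option is a plain bool defaulting to False, so on the CLI
# path this value is never None and the replacement always happens — confirm intended.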
if docling_programmatic_add_orphan_text_cells is not None:
layout_options_prog = LayoutOptions()
layout_options_prog.create_orphan_clusters = (
docling_programmatic_add_orphan_text_cells
)
pdf_pipeline_options.layout_options = layout_options_prog

ocr_pipeline_options = PdfPipelineOptions(
do_ocr=True,
ocr_options=ocr_options, # we need to provide OCR options in order to not break the parquet serialization
Expand Down Expand Up @@ -1169,6 +1179,15 @@ def create_eval(
Optional[bool],
typer.Option(help="Keep the empty clusters in Docling layout post-processing"),
] = False,
programmatic_add_orphan_text_cells: Annotated[
bool,
typer.Option(
help=(
"Add orphan text cells for programmatic Docling pipeline (PDF_DOCLING). "
"Defaults to False."
)
),
] = False,
do_visualization: Annotated[
bool, typer.Option(help="visualize the predictions")
] = True,
Expand Down Expand Up @@ -1226,6 +1245,7 @@ def create_eval(
docling_layout_model_spec=docling_layout_model_spec_obj,
docling_layout_create_orphan_clusters=docling_layout_create_orphan_clusters,
docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
docling_programmatic_add_orphan_text_cells=programmatic_add_orphan_text_cells,
docling_force_full_page_ocr=docling_force_full_page_ocr,
)

Expand Down Expand Up @@ -1285,6 +1305,15 @@ def create(
bool,
typer.Option(help="Force OCR on entire page (only Docling OCR providers)"),
] = False,
programmatic_add_orphan_text_cells: Annotated[
bool,
typer.Option(
help=(
"Add orphan text cells for programmatic Docling pipeline (PDF_DOCLING). "
"Defaults to False."
)
),
] = False,
):
"""Create both ground truth and evaluation datasets in one step."""
# First create ground truth
Expand Down Expand Up @@ -1315,6 +1344,7 @@ def create(
image_scale_factor=image_scale_factor,
do_table_structure=do_table_structure,
docling_force_full_page_ocr=docling_force_full_page_ocr,
programmatic_add_orphan_text_cells=programmatic_add_orphan_text_cells,
)
else:
_log.info(
Expand Down
Loading