Skip to content

Commit cd7f7ba

Browse files
authored
fix: Use proper page concatenation in VLM pipeline MD/HTML conversion (#2458)
* Use proper page concatenation in VLM pipeline MD/HTML conversion Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
1 parent 3687d86 commit cd7f7ba

File tree

1 file changed

+53
-33
lines changed

1 file changed

+53
-33
lines changed

docling/pipeline/vlm_pipeline.py

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from docling_core.types.doc import (
88
BoundingBox,
9+
ContentLayer,
910
DocItem,
1011
DoclingDocument,
1112
ImageRef,
@@ -251,9 +252,9 @@ def _extract_markdown_code(text):
251252
# No code blocks found, return original text
252253
return text
253254

254-
for pg_idx, page in enumerate(conv_res.pages):
255-
page_no = pg_idx + 1 # FIXME: might be incorrect
255+
page_docs = []
256256

257+
for pg_idx, page in enumerate(conv_res.pages):
257258
predicted_text = ""
258259
if page.predictions.vlm_response:
259260
predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -273,34 +274,43 @@ def _extract_markdown_code(text):
273274
)
274275
page_doc = backend.convert()
275276

277+
# Modify provenance in place for all items in the page document
278+
for item, level in page_doc.iterate_items(
279+
with_groups=True,
280+
traverse_pictures=True,
281+
included_content_layers=set(ContentLayer),
282+
):
283+
if isinstance(item, DocItem):
284+
item.prov = [
285+
ProvenanceItem(
286+
page_no=pg_idx + 1,
287+
bbox=BoundingBox(
288+
t=0.0, b=0.0, l=0.0, r=0.0
289+
), # FIXME: would be nice not to have to "fake" it
290+
charspan=[0, 0],
291+
)
292+
]
293+
294+
# Add page metadata to the page document before concatenation
276295
if page.image is not None:
277296
pg_width = page.image.width
278297
pg_height = page.image.height
279298
else:
280299
pg_width = 1
281300
pg_height = 1
282301

283-
conv_res.document.add_page(
284-
page_no=page_no,
302+
page_doc.add_page(
303+
page_no=pg_idx + 1,
285304
size=Size(width=pg_width, height=pg_height),
286305
image=ImageRef.from_pil(image=page.image, dpi=72)
287306
if page.image
288307
else None,
289308
)
290309

291-
for item, level in page_doc.iterate_items():
292-
item.prov = [
293-
ProvenanceItem(
294-
page_no=pg_idx + 1,
295-
bbox=BoundingBox(
296-
t=0.0, b=0.0, l=0.0, r=0.0
297-
), # FIXME: would be nice not to have to "fake" it
298-
charspan=[0, 0],
299-
)
300-
]
301-
conv_res.document.append_child_item(child=item)
310+
page_docs.append(page_doc)
302311

303-
return conv_res.document
312+
final_doc = DoclingDocument.concatenate(docs=page_docs)
313+
return final_doc
304314

305315
def _turn_html_into_doc(self, conv_res):
306316
def _extract_html_code(text):
@@ -328,9 +338,9 @@ def _extract_html_code(text):
328338
# No code blocks found, return original text
329339
return text
330340

331-
for pg_idx, page in enumerate(conv_res.pages):
332-
page_no = pg_idx + 1 # FIXME: might be incorrect
341+
page_docs = []
333342

343+
for pg_idx, page in enumerate(conv_res.pages):
334344
predicted_text = ""
335345
if page.predictions.vlm_response:
336346
predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -341,7 +351,7 @@ def _extract_html_code(text):
341351
out_doc = InputDocument(
342352
path_or_stream=response_bytes,
343353
filename=conv_res.input.file.name,
344-
format=InputFormat.MD,
354+
format=InputFormat.HTML,
345355
backend=HTMLDocumentBackend,
346356
)
347357
backend = HTMLDocumentBackend(
@@ -350,34 +360,44 @@ def _extract_html_code(text):
350360
)
351361
page_doc = backend.convert()
352362

363+
# Modify provenance in place for all items in the page document
364+
for item, level in page_doc.iterate_items(
365+
with_groups=True,
366+
traverse_pictures=True,
367+
included_content_layers=set(ContentLayer),
368+
):
369+
if isinstance(item, DocItem):
370+
item.prov = [
371+
ProvenanceItem(
372+
page_no=pg_idx + 1,
373+
bbox=BoundingBox(
374+
t=0.0, b=0.0, l=0.0, r=0.0
375+
), # FIXME: would be nice not to have to "fake" it
376+
charspan=[0, 0],
377+
)
378+
]
379+
380+
# Add page metadata to the page document before concatenation
353381
if page.image is not None:
354382
pg_width = page.image.width
355383
pg_height = page.image.height
356384
else:
357385
pg_width = 1
358386
pg_height = 1
359387

360-
conv_res.document.add_page(
361-
page_no=page_no,
388+
page_doc.add_page(
389+
page_no=pg_idx + 1,
362390
size=Size(width=pg_width, height=pg_height),
363391
image=ImageRef.from_pil(image=page.image, dpi=72)
364392
if page.image
365393
else None,
366394
)
367395

368-
for item, level in page_doc.iterate_items():
369-
item.prov = [
370-
ProvenanceItem(
371-
page_no=pg_idx + 1,
372-
bbox=BoundingBox(
373-
t=0.0, b=0.0, l=0.0, r=0.0
374-
), # FIXME: would be nice not to have to "fake" it
375-
charspan=[0, 0],
376-
)
377-
]
378-
conv_res.document.append_child_item(child=item)
396+
page_docs.append(page_doc)
379397

380-
return conv_res.document
398+
# Concatenate all page documents to preserve hierarchy
399+
final_doc = DoclingDocument.concatenate(docs=page_docs)
400+
return final_doc
381401

382402
@classmethod
383403
def get_default_options(cls) -> VlmPipelineOptions:

0 commit comments

Comments
 (0)