66
77from docling_core .types .doc import (
88 BoundingBox ,
9+ ContentLayer ,
910 DocItem ,
1011 DoclingDocument ,
1112 ImageRef ,
@@ -251,9 +252,9 @@ def _extract_markdown_code(text):
251252 # No code blocks found, return original text
252253 return text
253254
254- for pg_idx , page in enumerate (conv_res .pages ):
255- page_no = pg_idx + 1 # FIXME: might be incorrect
255+ page_docs = []
256256
257+ for pg_idx , page in enumerate (conv_res .pages ):
257258 predicted_text = ""
258259 if page .predictions .vlm_response :
259260 predicted_text = page .predictions .vlm_response .text + "\n \n "
@@ -273,34 +274,43 @@ def _extract_markdown_code(text):
273274 )
274275 page_doc = backend .convert ()
275276
277+ # Modify provenance in place for all items in the page document
278+ for item , level in page_doc .iterate_items (
279+ with_groups = True ,
280+ traverse_pictures = True ,
281+ included_content_layers = set (ContentLayer ),
282+ ):
283+ if isinstance (item , DocItem ):
284+ item .prov = [
285+ ProvenanceItem (
286+ page_no = pg_idx + 1 ,
287+ bbox = BoundingBox (
288+ t = 0.0 , b = 0.0 , l = 0.0 , r = 0.0
289+ ), # FIXME: would be nice not to have to "fake" it
290+ charspan = [0 , 0 ],
291+ )
292+ ]
293+
294+ # Add page metadata to the page document before concatenation
276295 if page .image is not None :
277296 pg_width = page .image .width
278297 pg_height = page .image .height
279298 else :
280299 pg_width = 1
281300 pg_height = 1
282301
283- conv_res . document .add_page (
284- page_no = page_no ,
302+ page_doc .add_page (
303+ page_no = pg_idx + 1 ,
285304 size = Size (width = pg_width , height = pg_height ),
286305 image = ImageRef .from_pil (image = page .image , dpi = 72 )
287306 if page .image
288307 else None ,
289308 )
290309
291- for item , level in page_doc .iterate_items ():
292- item .prov = [
293- ProvenanceItem (
294- page_no = pg_idx + 1 ,
295- bbox = BoundingBox (
296- t = 0.0 , b = 0.0 , l = 0.0 , r = 0.0
297- ), # FIXME: would be nice not to have to "fake" it
298- charspan = [0 , 0 ],
299- )
300- ]
301- conv_res .document .append_child_item (child = item )
310+ page_docs .append (page_doc )
302311
303- return conv_res .document
312+ final_doc = DoclingDocument .concatenate (docs = page_docs )
313+ return final_doc
304314
305315 def _turn_html_into_doc (self , conv_res ):
306316 def _extract_html_code (text ):
@@ -328,9 +338,9 @@ def _extract_html_code(text):
328338 # No code blocks found, return original text
329339 return text
330340
331- for pg_idx , page in enumerate (conv_res .pages ):
332- page_no = pg_idx + 1 # FIXME: might be incorrect
341+ page_docs = []
333342
343+ for pg_idx , page in enumerate (conv_res .pages ):
334344 predicted_text = ""
335345 if page .predictions .vlm_response :
336346 predicted_text = page .predictions .vlm_response .text + "\n \n "
@@ -341,7 +351,7 @@ def _extract_html_code(text):
341351 out_doc = InputDocument (
342352 path_or_stream = response_bytes ,
343353 filename = conv_res .input .file .name ,
344- format = InputFormat .MD ,
354+ format = InputFormat .HTML ,
345355 backend = HTMLDocumentBackend ,
346356 )
347357 backend = HTMLDocumentBackend (
@@ -350,34 +360,44 @@ def _extract_html_code(text):
350360 )
351361 page_doc = backend .convert ()
352362
363+ # Modify provenance in place for all items in the page document
364+ for item , level in page_doc .iterate_items (
365+ with_groups = True ,
366+ traverse_pictures = True ,
367+ included_content_layers = set (ContentLayer ),
368+ ):
369+ if isinstance (item , DocItem ):
370+ item .prov = [
371+ ProvenanceItem (
372+ page_no = pg_idx + 1 ,
373+ bbox = BoundingBox (
374+ t = 0.0 , b = 0.0 , l = 0.0 , r = 0.0
375+ ), # FIXME: would be nice not to have to "fake" it
376+ charspan = [0 , 0 ],
377+ )
378+ ]
379+
380+ # Add page metadata to the page document before concatenation
353381 if page .image is not None :
354382 pg_width = page .image .width
355383 pg_height = page .image .height
356384 else :
357385 pg_width = 1
358386 pg_height = 1
359387
360- conv_res . document .add_page (
361- page_no = page_no ,
388+ page_doc .add_page (
389+ page_no = pg_idx + 1 ,
362390 size = Size (width = pg_width , height = pg_height ),
363391 image = ImageRef .from_pil (image = page .image , dpi = 72 )
364392 if page .image
365393 else None ,
366394 )
367395
368- for item , level in page_doc .iterate_items ():
369- item .prov = [
370- ProvenanceItem (
371- page_no = pg_idx + 1 ,
372- bbox = BoundingBox (
373- t = 0.0 , b = 0.0 , l = 0.0 , r = 0.0
374- ), # FIXME: would be nice not to have to "fake" it
375- charspan = [0 , 0 ],
376- )
377- ]
378- conv_res .document .append_child_item (child = item )
396+ page_docs .append (page_doc )
379397
380- return conv_res .document
398+ # Concatenate all page documents to preserve hierarchy
399+ final_doc = DoclingDocument .concatenate (docs = page_docs )
400+ return final_doc
381401
382402 @classmethod
383403 def get_default_options (cls ) -> VlmPipelineOptions :
0 commit comments