|
| 1 | +from io import BytesIO |
| 2 | +import os |
| 3 | +import json |
| 4 | +import base64 |
| 5 | +import sys |
| 6 | +import re |
| 7 | +from docling.document_converter import DocumentConverter, PdfFormatOption |
| 8 | +from docling.datamodel.base_models import DocumentStream, InputFormat |
| 9 | +from docling.datamodel.pipeline_options import PdfPipelineOptions |
| 10 | +from docling_core.types.doc import ImageRefMode, PictureItem |
| 11 | + |
| 12 | + |
# Parameters that must be present in the JSON request read from stdin.
REQUIRED_PARAMS = ("display-image-tag", "display-all-page-image", "PDF")

# Resolution used when the request does not specify a usable one.
DEFAULT_RESOLUTION_DPI = 300


def resolve_resolution(params):
    """Return the rendering resolution (DPI) requested in ``params``.

    Falls back to ``DEFAULT_RESOLUTION_DPI`` when the ``"resolution"`` key is
    absent, ``None`` or ``0``.
    """
    return params.get("resolution") or DEFAULT_RESOLUTION_DPI


def number_image_placeholders(markdown_pages, display_image_tag):
    """Rewrite the ``<!-- image -->`` placeholders in each page's markdown.

    When ``display_image_tag`` is true, every placeholder becomes
    ``![image N](N)``, where ``N`` counts up from 0 across the whole document
    (the counter is not reset per page). Otherwise the placeholders are
    removed.

    Returns a new list; ``markdown_pages`` itself is not modified.
    """
    next_index = 0

    def replace_image(_match):
        nonlocal next_index
        if not display_image_tag:
            return ""  # Remove the image tag if display-image-tag is False
        tag = f"![image {next_index}]({next_index})"
        next_index += 1
        return tag

    return [
        re.sub(r"<!-- image -->", replace_image, page)
        for page in markdown_pages
    ]


def convert_pdf(params):
    """Convert the base64-encoded PDF described by ``params`` with docling.

    ``params`` is the decoded JSON request. It must contain the keys
    ``"display-image-tag"``, ``"display-all-page-image"`` and ``"PDF"``
    (base64 text); ``"resolution"`` (DPI) is optional.

    Returns the JSON-serialisable result dict with the keys ``body``,
    ``images``, ``parsing_error``, ``all_page_images``,
    ``display_all_page_image`` and ``markdowns``.

    Raises:
        ValueError: a required parameter is missing, or docling produced no
            image data for a picture or for a requested page.
        RuntimeError: the ``DOCLING_ARTIFACTS_PATH`` environment variable is
            not set.
    """
    missing = [key for key in REQUIRED_PARAMS if key not in params]
    if missing:
        raise ValueError(f"missing required parameter(s): {', '.join(missing)}")

    display_image_tag = params["display-image-tag"]
    display_all_page_image = params["display-all-page-image"]
    pdf_file_obj = BytesIO(base64.b64decode(params["PDF"]))

    # Convert resolution DPI to image resolution scale
    # (the scale is relative to 72 DPI).
    image_resolution_scale = resolve_resolution(params) / 72.0

    # The model artifacts should be prefetched and stored in a location
    # given through the `DOCLING_ARTIFACTS_PATH` variable.
    artifacts_path = os.environ.get("DOCLING_ARTIFACTS_PATH")
    if artifacts_path is None:
        raise RuntimeError(
            "the DOCLING_ARTIFACTS_PATH environment variable is not set"
        )

    # Configure the pipeline options
    pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
    pipeline_options.images_scale = image_resolution_scale
    pipeline_options.generate_page_images = display_all_page_image
    pipeline_options.generate_picture_images = True

    # Initialize the document converter
    source = DocumentStream(name="document.pdf", stream=pdf_file_obj)
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Process the PDF document
    document = converter.convert(source).document

    # Extract the markdown text per page (this code addresses pages 1..n),
    # then format the image placeholders according to current convention.
    markdown_pages = number_image_placeholders(
        [
            document.export_to_markdown(
                page_no=page_no,
                image_mode=ImageRefMode.PLACEHOLDER,
            )
            for page_no in range(1, document.num_pages() + 1)
        ],
        display_image_tag,
    )

    # Extract the images/figures from the document
    images = []
    pages_with_images = set()
    for element, _level in document.iterate_items():
        if not isinstance(element, PictureItem):
            continue
        if element.image is None:
            # Fail loudly instead of skipping: the numbered markdown tags are
            # positional, so dropping an entry would shift every later index.
            raise ValueError(f"no image data was generated for picture {len(images)}")
        images.append(str(element.image.uri))
        if element.prov:
            pages_with_images.add(element.prov[0].page_no)

    # Extract images of the full pages for pages that contain images/figures
    all_page_images = []
    if display_all_page_image:
        for page_no, page in document.pages.items():
            if page_no not in pages_with_images:
                continue
            if page.image is None:
                raise ValueError(f"no page image was generated for page {page_no}")
            all_page_images.append(str(page.image.uri))

    # Collate the output
    return {
        "body": "\n\n".join(markdown_pages),
        "images": images,
        # Kept for output-format compatibility; nothing in this script
        # populates it.
        "parsing_error": [],
        "all_page_images": all_page_images,
        "display_all_page_image": display_all_page_image,
        "markdowns": markdown_pages,
    }


def main():
    """Read a JSON request from stdin and print the JSON result to stdout.

    Any failure, including unreadable or malformed input, is reported as
    ``{"system_error": "<message>"}`` so the caller always receives JSON.
    """
    try:
        params = json.loads(sys.stdin.buffer.read().decode("utf-8"))
        payload = json.dumps(convert_pdf(params))
    except Exception as e:  # top-level boundary: report instead of crashing
        payload = json.dumps({"system_error": str(e)})
    print(payload)


if __name__ == "__main__":
    main()
0 commit comments