Skip to content

Commit a9dbf55

Browse files
authored
feat(document): convert PDF to Markdown with Docling (#959)
Because:
- Docling is an alternative to pdfplumber that yields more accurate results on PDF-to-Markdown conversion.

This commit:
- Adds a parameter to select the converter in the PDF-to-Markdown task. Docling is included in the container with the model artifacts prefetched.
- This is a quick prototype and needs some polish before it is production-ready.
1 parent 2699679 commit a9dbf55

15 files changed

+272
-46
lines changed

Dockerfile

+11-2
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=typ
5151

5252
FROM debian:bullseye-slim
5353

54-
# Install Python, create virtual environment, and install pdfplumber
54+
# Install Python, create virtual environment, install pdfplumber and Docling
5555
RUN apt update && \
5656
apt install -y curl wget xz-utils python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice libsoxr-dev chromium qpdf && \
5757
python3 -m venv /opt/venv && \
58-
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
58+
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers docling docling-core && \
5959
rm -rf /var/lib/apt/lists/*
6060

6161
# Copy FFmpeg from build stage
@@ -90,3 +90,12 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
9090
# Set up ONNX model and environment variable
9191
COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
9292
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
93+
94+
# Prefetch Docling models and set environment variable with the path to the
95+
# artifacts.
96+
ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling
97+
98+
RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline\n" > import_artifacts.py
99+
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
100+
RUN /opt/venv/bin/python import_artifacts.py
101+
RUN rm import_artifacts.py

Dockerfile.dev

+11-2
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ WORKDIR /${SERVICE_NAME}
99

1010
ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION XK6_SQL_VERSION XK6_SQL_POSTGRES_VERSION
1111

12-
# Install Python, create virtual environment, and install pdfplumber
12+
# Install Python, create virtual environment, install pdfplumber and Docling
1313
RUN apt update && \
1414
apt install -y xz-utils python3 python3-venv poppler-utils wv unrtf tidy tesseract-ocr libtesseract-dev libreoffice libsoxr-dev chromium qpdf && \
1515
python3 -m venv /opt/venv && \
16-
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers && \
16+
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers docling docling-core && \
1717
rm -rf /var/lib/apt/lists/*
1818

1919
# Install FFmpeg Static Build
@@ -74,6 +74,15 @@ ENV GODEBUG=tlsrsakex=1
7474
COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
7575
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
7676

77+
# Prefetch Docling models and set environment variable with the path to the
78+
# artifacts.
79+
ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling
80+
81+
RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline\n" > import_artifacts.py
82+
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
83+
RUN /opt/venv/bin/python import_artifacts.py
84+
RUN rm import_artifacts.py
85+
7786
USER nobody:nogroup
7887

7988
ENTRYPOINT ["tail", "-f", "/dev/null"]

pkg/component/operator/document/v0/config/definition.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ spec: {}
1111
title: Document
1212
type: COMPONENT_TYPE_OPERATOR
1313
uid: e5b290ae-ad53-47c9-a64e-efbc5358520b
14-
version: 0.1.1
14+
version: 0.1.2
1515
sourceUrl: https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/operator/document/v0
1616
description: Manipulate Document files.
1717
releaseStage: RELEASE_STAGE_ALPHA

pkg/component/operator/document/v0/config/tasks.yaml

+11
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@ TASK_CONVERT_TO_MARKDOWN:
3737
resolution:
3838
$ref: '#/$defs/resolution'
3939
uiOrder: 4
40+
use-docling-converter:
41+
# TODO jvallesm: rather than using a boolean to switch between
42+
# converters, use a converter selector.
43+
default: false
44+
description: |
45+
When converting PDF to Markdown, use `docling` instead of
46+
`pdfplumber`. This converter is more resource-intensive but usually
47+
yields more accurate results.
48+
uiOrder: 5
49+
title: Use Docling to convert PDF to Markdown
50+
type: boolean
4051
required:
4152
- document
4253
title: Input

pkg/component/operator/document/v0/convert_document_to_markdown.go

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ func (e *execution) convertDocumentToMarkdown(ctx context.Context, job *base.Job
2929
Filename: inputStruct.Filename,
3030
DisplayAllPageImage: inputStruct.DisplayAllPageImage,
3131
Resolution: inputStruct.Resolution,
32+
UseDoclingConverter: inputStruct.UseDoclingConverter,
3233
}
3334

3435
transformerOutputStruct, err := transformer.ConvertDocumentToMarkdown(&transformerInputStruct, e.getMarkdownTransformer)

pkg/component/operator/document/v0/io.go

+3
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@ package document
22

33
import "github.com/instill-ai/pipeline-backend/pkg/data/format"
44

5+
// ConvertDocumentToMarkdownInput contains the public input parameters for
6+
// `TASK_CONVERT_TO_MARKDOWN`.
57
type ConvertDocumentToMarkdownInput struct {
68
Document format.Document `instill:"document"`
79
DisplayImageTag bool `instill:"display-image-tag,default=false"`
810
Filename string `instill:"filename"`
911
DisplayAllPageImage bool `instill:"display-all-page-image,default=false"`
1012
Resolution int `instill:"resolution,default=300"`
13+
UseDoclingConverter bool `instill:"use-docling-converter,default=false"`
1114
}
1215

1316
type ConvertDocumentToMarkdownOutput struct {

pkg/component/operator/document/v0/transformer/const.go

+9-6
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,19 @@ const (
99
)
1010

1111
var (
12-
13-
//go:embed execution/task_convert_to_markdown.py
14-
taskConvertToMarkdownExecution string
1512
//go:embed pdf_to_markdown/pdf_transformer.py
1613
pdfTransformer string
1714
//go:embed pdf_to_markdown/page_image_processor.py
18-
imageProcessor string
15+
pageImageProcessor string
16+
17+
//go:embed execution/docling_pdf_to_md_converter.py
18+
doclingPDFToMDConverter string
19+
20+
//go:embed execution/pdfplumber_pdf_to_md_converter.py
21+
pdfPlumberPDFToMDConverter string
1922

20-
//go:embed execution/task_convert_to_images.py
21-
taskConvertToImagesExecution string
23+
//go:embed execution/image_converter.py
24+
imageConverter string
2225

2326
//go:embed execution/pdf_checker.py
2427
pdfChecker string
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
from io import BytesIO
2+
import os
3+
import json
4+
import base64
5+
import sys
6+
import re
7+
from docling.document_converter import DocumentConverter, PdfFormatOption
8+
from docling.datamodel.base_models import DocumentStream, InputFormat
9+
from docling.datamodel.pipeline_options import PdfPipelineOptions
10+
from docling_core.types.doc import ImageRefMode, PictureItem
11+
12+
13+
if __name__ == "__main__":
14+
json_str = sys.stdin.buffer.read().decode('utf-8')
15+
params = json.loads(json_str)
16+
display_image_tag = params["display-image-tag"]
17+
display_all_page_image = params["display-all-page-image"]
18+
pdf_string = params["PDF"]
19+
if ("resolution" in params and
20+
params["resolution"] != 0 and
21+
params["resolution"] is not None):
22+
resolution = params["resolution"]
23+
else:
24+
resolution = 300
25+
decoded_bytes = base64.b64decode(pdf_string)
26+
pdf_file_obj = BytesIO(decoded_bytes)
27+
28+
# Convert resolution DPI to image resolution scale
29+
image_resolution_scale = resolution / 72.0
30+
31+
# Initialize variables
32+
images = []
33+
all_page_images = []
34+
page_numbers_with_images = []
35+
elements = []
36+
errors = []
37+
38+
try:
39+
# Configure the pipeline options
40+
# The model artifacts should be prefetched and stored in a location
41+
# specified by the `DOCLING_ARTIFACTS_PATH` environment variable.
42+
pipeline_options = PdfPipelineOptions(artifacts_path=os.environ['DOCLING_ARTIFACTS_PATH'])
43+
pipeline_options.images_scale = image_resolution_scale
44+
pipeline_options.generate_page_images = display_all_page_image
45+
pipeline_options.generate_picture_images = True
46+
47+
# Initialize the document converter
48+
source = DocumentStream(name="document.pdf", stream=pdf_file_obj)
49+
converter = DocumentConverter(
50+
format_options={
51+
InputFormat.PDF: PdfFormatOption(
52+
pipeline_options=pipeline_options
53+
)
54+
}
55+
)
56+
57+
# Process the PDF document
58+
doc = converter.convert(source)
59+
60+
# Extract the markdown text per page
61+
markdown_pages = [
62+
doc.document.export_to_markdown(
63+
page_no=i + 1,
64+
image_mode=ImageRefMode.PLACEHOLDER
65+
)
66+
for i in range(doc.document.num_pages())
67+
]
68+
69+
# Format the image placeholder according to current convention
70+
image_counter = [0]
71+
72+
def replace_image(match):
73+
if display_image_tag:
74+
replacement = f"![image {image_counter[0]}]({image_counter[0]})"
75+
image_counter[0] += 1
76+
return replacement
77+
else:
78+
return "" # Remove the image tag if display-image-tag is False
79+
80+
for page in range(len(markdown_pages)):
81+
updated_page = re.sub(
82+
r"<!-- image -->", replace_image, markdown_pages[page]
83+
)
84+
markdown_pages[page] = updated_page
85+
86+
# Join the markdown pages for the body output
87+
result = "\n\n".join(markdown_pages)
88+
89+
# Extract the images/figures from the document
90+
for element, _level in doc.document.iterate_items():
91+
if isinstance(element, PictureItem):
92+
image = element.get_image(doc.document)
93+
images.append(str(element.image.uri))
94+
page_numbers_with_images.append(element.prov[0].page_no)
95+
96+
# Extract images of the full pages for pages that contain images/figures
97+
if display_all_page_image:
98+
for page_no, page in doc.document.pages.items():
99+
if page_no in page_numbers_with_images:
100+
all_page_images.append(str(page.image.uri))
101+
102+
# Collate the output
103+
output = {
104+
"body": result,
105+
"images": images,
106+
"parsing_error": errors,
107+
"all_page_images": all_page_images,
108+
"display_all_page_image": display_all_page_image,
109+
"markdowns": markdown_pages,
110+
}
111+
print(json.dumps(output))
112+
except Exception as e:
113+
print(json.dumps({"system_error": str(e)}))

pkg/component/operator/document/v0/transformer/images.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func ConvertDocumentToImage(inputStruct *ConvertDocumentToImagesTransformerInput
8686
}, nil
8787
}
8888

89-
pythonCode := imageProcessor + pdfTransformer + taskConvertToImagesExecution
89+
pythonCode := pageImageProcessor + pdfTransformer + imageConverter
9090

9191
// We will make this number tunable & configurable in the future.
9292
maxWorkers := 5

pkg/component/operator/document/v0/transformer/markdown.go

+10-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ type ConvertDocumentToMarkdownTransformerInput struct {
1313
Filename string `json:"filename"`
1414
DisplayAllPageImage bool `json:"display-all-page-image"`
1515
Resolution int `json:"resolution"`
16+
UseDoclingConverter bool `json:"use-docling-converter"`
1617
}
1718

1819
type ConvertDocumentToMarkdownTransformerOutput struct {
@@ -71,10 +72,16 @@ func GetMarkdownTransformer(fileExtension string, inputStruct *ConvertDocumentTo
7172
DisplayAllPageImage: inputStruct.DisplayAllPageImage,
7273
Resolution: inputStruct.Resolution,
7374
}
75+
76+
converter := "pdfplumber"
77+
if inputStruct.UseDoclingConverter {
78+
converter = "docling"
79+
}
80+
7481
return PDFToMarkdownTransformer{
7582
FileExtension: fileExtension,
7683
PDFToMarkdownStruct: pdfToMarkdownStruct,
77-
PDFConvertFunc: getPDFConvertFunc("pdfplumber"),
84+
PDFConvertFunc: getPDFConvertFunc(converter),
7885
}, nil
7986
case "doc", "docx":
8087
pdfToMarkdownStruct := pdfToMarkdownInputStruct{
@@ -131,6 +138,8 @@ type pdfToMarkdownInputStruct struct {
131138
// We could provide more converters in the future. For now, we support pdfplumber (the default) and docling.
132139
func getPDFConvertFunc(converter string) func(pdfToMarkdownInputStruct) (converterOutput, error) {
133140
switch converter {
141+
case "docling":
142+
return convertPDFToMarkdownWithDocling
134143
default:
135144
return convertPDFToMarkdownWithPDFPlumber
136145
}

pkg/component/operator/document/v0/transformer/pdftomarkdown.go

+68-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ type converterOutput struct {
1919
}
2020

2121
func convertPDFToMarkdownWithPDFPlumber(input pdfToMarkdownInputStruct) (converterOutput, error) {
22-
2322
var pdfBase64 string
2423
var err error
2524
base64Text := input.Base64Text
@@ -45,8 +44,75 @@ func convertPDFToMarkdownWithPDFPlumber(input pdfToMarkdownInputStruct) (convert
4544
return output, fmt.Errorf("failed to marshal params: %w", err)
4645
}
4746

48-
pythonCode := imageProcessor + pdfTransformer + taskConvertToMarkdownExecution
47+
pythonCode := pageImageProcessor + pdfTransformer + pdfPlumberPDFToMDConverter
48+
49+
cmdRunner := exec.Command(pythonInterpreter, "-c", pythonCode)
50+
stdin, err := cmdRunner.StdinPipe()
51+
52+
if err != nil {
53+
return output, fmt.Errorf("failed to create stdin pipe: %w", err)
54+
}
55+
errChan := make(chan error, 1)
56+
go func() {
57+
defer stdin.Close()
58+
_, err := stdin.Write(paramsJSON)
59+
if err != nil {
60+
errChan <- err
61+
return
62+
}
63+
errChan <- nil
64+
}()
65+
66+
outputBytes, err := cmdRunner.CombinedOutput()
67+
if err != nil {
68+
errorStr := string(outputBytes)
69+
return output, fmt.Errorf("failed to run python script: %w, %s", err, errorStr)
70+
}
71+
72+
writeErr := <-errChan
73+
if writeErr != nil {
74+
return output, fmt.Errorf("failed to write to stdin: %w", writeErr)
75+
}
76+
77+
err = json.Unmarshal(outputBytes, &output)
78+
if err != nil {
79+
return output, fmt.Errorf("failed to unmarshal output: %w", err)
80+
}
81+
82+
if output.SystemError != "" {
83+
return output, fmt.Errorf("failed to convert pdf to markdown: %s", output.SystemError)
84+
}
85+
86+
return output, nil
87+
}
88+
89+
// TODO jvallesm: refactor converter functions, most of the code is shared.
90+
func convertPDFToMarkdownWithDocling(input pdfToMarkdownInputStruct) (converterOutput, error) {
91+
var pdfBase64 string
92+
var err error
93+
base64Text := input.Base64Text
94+
pdfBase64WithoutMime := util.TrimBase64Mime(base64Text)
95+
pdfBase64 = pdfBase64WithoutMime
96+
if RequiredToRepair(base64Text) {
97+
pdfBase64, err = RepairPDF(pdfBase64WithoutMime) // :question: needed?
98+
if err != nil {
99+
return converterOutput{}, fmt.Errorf("failed to repair PDF: %w", err)
100+
}
101+
}
102+
103+
paramsJSON, err := json.Marshal(map[string]interface{}{
104+
"PDF": pdfBase64,
105+
"display-image-tag": input.DisplayImageTag,
106+
"display-all-page-image": input.DisplayAllPageImage,
107+
"resolution": input.Resolution,
108+
})
109+
var output converterOutput
110+
111+
if err != nil {
112+
return output, fmt.Errorf("failed to marshal params: %w", err)
113+
}
49114

115+
pythonCode := doclingPDFToMDConverter
50116
cmdRunner := exec.Command(pythonInterpreter, "-c", pythonCode)
51117
stdin, err := cmdRunner.StdinPipe()
52118

0 commit comments

Comments
 (0)