Skip to content

Commit f4df57d

Browse files
committed
chore(document): capture logs from PDF python scripts
1 parent 113c4de commit f4df57d

File tree

2 files changed

+50
-27
lines changed

2 files changed

+50
-27
lines changed

pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,27 @@
44
import base64
55
import sys
66
import re
7-
from contextlib import redirect_stdout
7+
import logging
8+
9+
# Docling imports
810
from docling.document_converter import DocumentConverter, PdfFormatOption
911
from docling.datamodel.base_models import DocumentStream, InputFormat
1012
from docling.datamodel.pipeline_options import PdfPipelineOptions
1113
from docling_core.types.doc import ImageRefMode, PictureItem
1214

1315
if __name__ == "__main__":
16+
# Capture warnings and errors. These are printed to stderr by default, which
17+
# will prevent clients from unmarshalling the response.
18+
conversion_logs = StringIO()
19+
log_handler = logging.StreamHandler(conversion_logs)
20+
log_handler.setLevel(logging.WARNING)
21+
22+
# Remove any existing handlers to avoid duplicate logging
23+
logging.getLogger().handlers = []
24+
25+
# Add the handler to capture warnings/errors
26+
logging.getLogger().addHandler(log_handler)
27+
1428
json_str = sys.stdin.buffer.read().decode('utf-8')
1529
params = json.loads(json_str)
1630
display_image_tag = params["display-image-tag"]
@@ -55,17 +69,15 @@
5569
)
5670

5771
# Process the PDF document
58-
conversion_logs = StringIO()
59-
with redirect_stdout(conversion_logs):
60-
doc = converter.convert(source)
61-
62-
# Extract the markdown text per page
63-
markdown_pages = [
64-
doc.document.export_to_markdown(
65-
page_no=i + 1,
66-
image_mode=ImageRefMode.PLACEHOLDER
67-
)
68-
for i in range(doc.document.num_pages())
72+
doc = converter.convert(source)
73+
74+
# Extract the markdown text per page
75+
markdown_pages = [
76+
doc.document.export_to_markdown(
77+
page_no=i + 1,
78+
image_mode=ImageRefMode.PLACEHOLDER
79+
)
80+
for i in range(doc.document.num_pages())
6981
]
7082

7183
# Format the image placeholder according to current convention
@@ -113,4 +125,4 @@ def replace_image(match):
113125
}
114126
print(json.dumps(output))
115127
except Exception as e:
116-
print(json.dumps({"system_error": str(e)}))
128+
print(json.dumps({"system_error": str(e)}), file=sys.stderr)

pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py

Lines changed: 25 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
1-
from io import BytesIO, StringIO
2-
from contextlib import redirect_stdout
3-
import json
1+
# Standard library imports
42
import base64
3+
import json
4+
import logging
55
import sys
6+
from io import BytesIO, StringIO
67

78
# TODO chuang8511:
89
# Deal with the import error when running the code in the docker container.
@@ -12,11 +13,23 @@
1213

1314

1415
if __name__ == "__main__":
15-
json_str = sys.stdin.buffer.read().decode('utf-8')
16-
params = json.loads(json_str)
17-
display_image_tag = params["display-image-tag"]
18-
display_all_page_image = params["display-all-page-image"]
19-
pdf_string = params["PDF"]
16+
# Capture warnings and errors. These are printed to stderr by default, which
17+
# will prevent clients from unmarshalling the response.
18+
conversion_logs = StringIO()
19+
log_handler = logging.StreamHandler(conversion_logs)
20+
log_handler.setLevel(logging.WARNING)
21+
22+
# Remove any existing handlers to avoid duplicate logging
23+
logging.getLogger().handlers = []
24+
25+
# Add the handler to capture warnings/errors
26+
logging.getLogger().addHandler(log_handler)
27+
28+
json_str = sys.stdin.buffer.read().decode('utf-8')
29+
params = json.loads(json_str)
30+
display_image_tag = params["display-image-tag"]
31+
display_all_page_image = params["display-all-page-image"]
32+
pdf_string = params["PDF"]
2033
if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None:
2134
resolution = params["resolution"]
2235
else:
@@ -42,11 +55,9 @@
4255
else:
4356
pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number]
4457

45-
conversion_logs = StringIO()
46-
with redirect_stdout(conversion_logs):
47-
pdf.preprocess()
48-
image_index = pdf.image_index
49-
result += pdf.execute()
58+
pdf.preprocess()
59+
image_index = pdf.image_index
60+
result += pdf.execute()
5061

5162
for image in pdf.base64_images:
5263
images.append(image)
@@ -75,4 +86,4 @@
7586
}
7687
print(json.dumps(output))
7788
except Exception as e:
78-
print(json.dumps({"system_error": str(e)}))
89+
print(json.dumps({"system_error": str(e)}), file=sys.stderr)

0 commit comments

Comments
 (0)