chore(document): capture logs from PDF Python scripts

jvallesm · jvallesm · commit f4df57dcdc5a · 2025-02-07T09:14:08.000+01:00
diff --git a/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py b/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py
@@ -4,13 +4,27 @@
 import base64
 import sys
 import re
-from contextlib import redirect_stdout
+import logging
+
+# Docling imports
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling_core.types.doc import ImageRefMode, PictureItem
 
 if __name__ == "__main__":
+    # Capture warnings and errors. These are printed to stderr by default, which
+    # will prevent clients from unmarshalling the response.
+    conversion_logs = StringIO()
+    log_handler = logging.StreamHandler(conversion_logs)
+    log_handler.setLevel(logging.WARNING)
+
+    # Remove any existing handlers to avoid duplicate logging
+    logging.getLogger().handlers = []
+
+    # Add the handler to capture warnings/errors
+    logging.getLogger().addHandler(log_handler)
+
     json_str = sys.stdin.buffer.read().decode('utf-8')
     params = json.loads(json_str)
     display_image_tag = params["display-image-tag"]
@@ -55,17 +69,15 @@
         )
 
         # Process the PDF document
-        conversion_logs = StringIO()
-        with redirect_stdout(conversion_logs):
-            doc = converter.convert(source)
-
-            # Extract the markdown text per page
-            markdown_pages = [
-                doc.document.export_to_markdown(
-                    page_no=i + 1,
-                    image_mode=ImageRefMode.PLACEHOLDER
-                )
-                for i in range(doc.document.num_pages())
+        doc = converter.convert(source)
+
+        # Extract the markdown text per page
+        markdown_pages = [
+            doc.document.export_to_markdown(
+                page_no=i + 1,
+                image_mode=ImageRefMode.PLACEHOLDER
+            )
+            for i in range(doc.document.num_pages())
             ]
 
         # Format the image placeholder according to current convention
@@ -113,4 +125,4 @@ def replace_image(match):
         }
         print(json.dumps(output))
     except Exception as e:
-        print(json.dumps({"system_error": str(e)}))
+        print(json.dumps({"system_error": str(e)}), file=sys.stderr)
diff --git a/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py b/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py
@@ -1,8 +1,9 @@
-from io import BytesIO, StringIO
-from contextlib import redirect_stdout
-import json
+# Standard library imports
 import base64
+import json
+import logging
 import sys
+from io import BytesIO, StringIO
 
 # TODO chuang8511:
 # Deal with the import error when running the code in the docker container.
@@ -12,11 +13,23 @@
 
 
 if __name__ == "__main__":
-	json_str = sys.stdin.buffer.read().decode('utf-8')
-	params = json.loads(json_str)
-	display_image_tag = params["display-image-tag"]
-	display_all_page_image = params["display-all-page-image"]
-	pdf_string = params["PDF"]
+	# Capture warnings and errors in memory. By default they are printed to
+	# stderr, which will prevent clients from unmarshalling the response.
+	conversion_logs = StringIO()
+	log_handler = logging.StreamHandler(conversion_logs)
+	log_handler.setLevel(logging.WARNING)
+
+	# Remove any existing handlers to avoid duplicate logging
+	logging.getLogger().handlers = []
+
+	# Add the handler to capture warnings/errors
+	logging.getLogger().addHandler(log_handler)
+
+	json_str = sys.stdin.buffer.read().decode('utf-8')
+	params = json.loads(json_str)
+	display_image_tag = params["display-image-tag"]
+	display_all_page_image = params["display-all-page-image"]
+	pdf_string = params["PDF"]
 	if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None:
 		resolution = params["resolution"]
 	else:
@@ -42,11 +55,9 @@
 			else:
 				pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number]
 
-			conversion_logs = StringIO()
-			with redirect_stdout(conversion_logs):
-				pdf.preprocess()
-				image_index = pdf.image_index
-				result += pdf.execute()
+			pdf.preprocess()
+			image_index = pdf.image_index
+			result += pdf.execute()
 
 			for image in pdf.base64_images:
 				images.append(image)
@@ -75,4 +86,4 @@
 		}
 		print(json.dumps(output))
 	except Exception as e:
-		print(json.dumps({"system_error": str(e)}))
+		print(json.dumps({"system_error": str(e)}), file=sys.stderr)