Improve DocSum file handling

eero-t · eero-t · commit b59c9df01653 · 2025-05-12T15:18:17.000+03:00
Use a temporary file only when necessary, and use aiofiles'
own functionality for that.

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
diff --git a/DocSum/docsum.py b/DocSum/docsum.py
@@ -1,7 +1,6 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import asyncio
 import base64
 import os
 import subprocess
@@ -55,15 +54,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
     return inputs
 
 
-def read_pdf(file):
+def read_pdf(file: str):
     from langchain.document_loaders import PyPDFLoader
 
     loader = PyPDFLoader(file)
     docs = loader.load_and_split()
     return docs
 
 
-def encode_file_to_base64(file_path):
+async def encode_file_to_base64(f: UploadFile):
     """Encode the content of a file to a base64 string.
 
     Args:
@@ -72,8 +71,7 @@ def encode_file_to_base64(file_path):
     Returns:
         str: The base64 encoded string of the file content.
     """
-    with open(file_path, "rb") as f:
-        base64_str = base64.b64encode(f.read()).decode("utf-8")
+    base64_str = await base64.b64encode(f.read()).decode("utf-8")
     return base64_str
 
 
@@ -90,6 +88,7 @@ def video2audio(
     """
     video_data = base64.b64decode(video_base64)
 
+    # TODO: why is this processing not async?
     uid = str(uuid.uuid4())
     temp_video_path = f"{uid}.mp4"
     temp_audio_path = f"{uid}.mp3"
@@ -115,29 +114,50 @@ def video2audio(
     return audio_base64
 
 
-def read_text_from_file(file, save_file_name):
+async def read_text_from_file(file: UploadFile):
+    ctype = file.headers["content-type"]
+    valid = (
+        "text/plain",
+        "application/pdf",
+        "application/octet-stream",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    file_content = None
+    if ctype not in valid:
+        return file_content
+
+    import aiofiles
     import docx2txt
     from langchain.text_splitter import CharacterTextSplitter
 
     # read text file
-    if file.headers["content-type"] == "text/plain":
+    if ctype == "text/plain":
         file.file.seek(0)
         content = file.file.read().decode("utf-8")
-        # Split text
+        # Split text to multiple documents
         text_splitter = CharacterTextSplitter()
-        texts = text_splitter.split_text(content)
-        # Create multiple documents
-        file_content = texts
-    # read pdf file
-    elif file.headers["content-type"] == "application/pdf":
-        documents = read_pdf(save_file_name)
-        file_content = [doc.page_content for doc in documents]
-    # read docx file
-    elif (
-        file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        or file.headers["content-type"] == "application/octet-stream"
-    ):
-        file_content = docx2txt.process(save_file_name)
+        return text_splitter.split_text(content)
+
+    # need a tmp file for rest
+    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
+        await tmp.write(await file.read())
+        await tmp.flush()
+
+        # read pdf file
+        if ctype == "application/pdf":
+            documents = read_pdf(tmp.name)
+            file_content = [doc.page_content for doc in documents]
+
+        # read docx file
+        if ctype in (
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/octet-stream",
+        ):
+            file_content = docx2txt.process(tmp.name)
+
+        # remove temp file
+        await tmp.close()
 
     return file_content
 
@@ -201,25 +221,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
             file_summaries = []
             if files:
                 for file in files:
-                    # Fix concurrency issue with the same file name
-                    # https://github.com/opea-project/GenAIExamples/issues/1279
-                    uid = str(uuid.uuid4())
-                    file_path = f"/tmp/{uid}"
-
-                    import aiofiles
-
-                    async with aiofiles.open(file_path, "wb") as f:
-                        await f.write(await file.read())
 
                     if data_type == "text":
-                        docs = read_text_from_file(file, file_path)
+                        docs = await read_text_from_file(file)
                     elif data_type in ["audio", "video"]:
-                        docs = encode_file_to_base64(file_path)
+                        docs = await encode_file_to_base64(file)
                     else:
                         raise ValueError(f"Data type not recognized: {data_type}")
 
-                    os.remove(file_path)
-
                     if isinstance(docs, list):
                         file_summaries.extend(docs)
                     else: