1
1
# Copyright (C) 2024 Intel Corporation
2
2
# SPDX-License-Identifier: Apache-2.0
3
3
4
- import asyncio
5
4
import base64
6
5
import os
7
6
import subprocess
@@ -55,15 +54,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
55
54
return inputs
56
55
57
56
58
- def read_pdf (file ):
57
+ def read_pdf (file : str ):
59
58
from langchain .document_loaders import PyPDFLoader
60
59
61
60
loader = PyPDFLoader (file )
62
61
docs = loader .load_and_split ()
63
62
return docs
64
63
65
64
66
- def encode_file_to_base64 (file_path ):
65
+ async def encode_file_to_base64 (f : UploadFile ):
67
66
"""Encode the content of a file to a base64 string.
68
67
69
68
Args:
@@ -72,8 +71,7 @@ def encode_file_to_base64(file_path):
72
71
Returns:
73
72
str: The base64 encoded string of the file content.
74
73
"""
75
- with open (file_path , "rb" ) as f :
76
- base64_str = base64 .b64encode (f .read ()).decode ("utf-8" )
74
+ base64_str = await base64 .b64encode (f .read ()).decode ("utf-8" )
77
75
return base64_str
78
76
79
77
@@ -90,6 +88,7 @@ def video2audio(
90
88
"""
91
89
video_data = base64 .b64decode (video_base64 )
92
90
91
+ # TODO: why is this processing not async?
93
92
uid = str (uuid .uuid4 ())
94
93
temp_video_path = f"{ uid } .mp4"
95
94
temp_audio_path = f"{ uid } .mp3"
@@ -115,29 +114,50 @@ def video2audio(
115
114
return audio_base64
116
115
117
116
118
- def read_text_from_file (file , save_file_name ):
117
+ async def read_text_from_file (file : UploadFile ):
118
+ ctype = file .headers ["content-type" ]
119
+ valid = (
120
+ "text/plain" ,
121
+ "application/pdf" ,
122
+ "application/octet-stream" ,
123
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ,
124
+ )
125
+
126
+ file_content = None
127
+ if ctype not in valid :
128
+ return file_content
129
+
130
+ import aiofiles
119
131
import docx2txt
120
132
from langchain .text_splitter import CharacterTextSplitter
121
133
122
134
# read text file
123
- if file . headers [ "content-type" ] == "text/plain" :
135
+ if ctype == "text/plain" :
124
136
file .file .seek (0 )
125
137
content = file .file .read ().decode ("utf-8" )
126
- # Split text
138
+ # Split text into multiple documents
127
139
text_splitter = CharacterTextSplitter ()
128
- texts = text_splitter .split_text (content )
129
- # Create multiple documents
130
- file_content = texts
131
- # read pdf file
132
- elif file .headers ["content-type" ] == "application/pdf" :
133
- documents = read_pdf (save_file_name )
134
- file_content = [doc .page_content for doc in documents ]
135
- # read docx file
136
- elif (
137
- file .headers ["content-type" ] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
138
- or file .headers ["content-type" ] == "application/octet-stream"
139
- ):
140
- file_content = docx2txt .process (save_file_name )
140
+ return text_splitter .split_text (content )
141
+
142
+ # the remaining file types (pdf, docx) are read from a path, so write the upload to a temp file
143
+ async with aiofiles .tempfile .NamedTemporaryFile () as tmp :
144
+ await tmp .write (await file .read ())
145
+ await tmp .flush ()
146
+
147
+ # read pdf file
148
+ if ctype == "application/pdf" :
149
+ documents = read_pdf (tmp .name )
150
+ file_content = [doc .page_content for doc in documents ]
151
+
152
+ # read docx file
153
+ if ctype in (
154
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ,
155
+ "application/octet-stream" ,
156
+ ):
157
+ file_content = docx2txt .process (tmp .name )
158
+
159
+ # remove temp file
160
+ await tmp .close ()
141
161
142
162
return file_content
143
163
@@ -201,25 +221,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
201
221
file_summaries = []
202
222
if files :
203
223
for file in files :
204
- # Fix concurrency issue with the same file name
205
- # https://github.com/opea-project/GenAIExamples/issues/1279
206
- uid = str (uuid .uuid4 ())
207
- file_path = f"/tmp/{ uid } "
208
-
209
- import aiofiles
210
-
211
- async with aiofiles .open (file_path , "wb" ) as f :
212
- await f .write (await file .read ())
213
224
214
225
if data_type == "text" :
215
- docs = read_text_from_file (file , file_path )
226
+ docs = await read_text_from_file (file )
216
227
elif data_type in ["audio" , "video" ]:
217
- docs = encode_file_to_base64 (file_path )
228
+ docs = await encode_file_to_base64 (file )
218
229
else :
219
230
raise ValueError (f"Data type not recognized: { data_type } " )
220
231
221
- os .remove (file_path )
222
-
223
232
if isinstance (docs , list ):
224
233
file_summaries .extend (docs )
225
234
else :
0 commit comments