 import datetime
 import mimetypes
 import requests
+import tempfile
 
 from collections import Counter
 from urllib.parse import urljoin
 
-from application.storage.storage_creator import StorageCreator
+from application.storage.storage_creator import StorageCreator
 from application.utils import num_tokens_from_string
 from application.core.settings import settings
 from application.parser.file.bulk import SimpleDirectoryReader
@@ -209,7 +210,7 @@ def remote_worker(
     sync_frequency="never",
     operation_mode="upload",
     doc_id=None,
-):
+):
     full_path = os.path.join(directory, user, name_job)
     if not os.path.exists(full_path):
         os.makedirs(full_path)
@@ -324,58 +325,48 @@ def attachment_worker(self, file_info, user):
     """
     Process and store a single attachment without vectorization.
     """
-
+
     mongo = MongoDB.get_client()
     db = mongo["docsgpt"]
     attachments_collection = db["attachments"]
-
+
     filename = file_info["filename"]
     attachment_id = file_info["attachment_id"]
     relative_path = file_info["path"]
     file_content = file_info["file_content"]
-
+
     try:
         self.update_state(state="PROGRESS", meta={"current": 10})
-
         storage_type = getattr(settings, "STORAGE_TYPE", "local")
         storage = StorageCreator.create_storage(storage_type)
-
-        self.update_state(state="PROGRESS", meta={"current": 30, "status": "Saving file"})
-        file_obj = io.BytesIO(file_content)
-        storage.save_file(file_obj, relative_path)
-
-        def process_document(file_path, **kwargs):
-            self.update_state(state="PROGRESS", meta={"current": 50, "status": "Processing content"})
-
+        self.update_state(state="PROGRESS", meta={"current": 30, "status": "Processing content"})
+
+        with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1]) as temp_file:
+            temp_file.write(file_content)
+            temp_file.flush()
             reader = SimpleDirectoryReader(
-                input_files=[file_path],
+                input_files=[temp_file.name],
                 exclude_hidden=True,
                 errors="ignore"
             )
             documents = reader.load_data()
-
+
             if not documents:
                 logging.warning(f"No content extracted from file: {filename}")
                 raise ValueError(f"Failed to extract content from file: {filename}")
-
+
             content = documents[0].text
             token_count = num_tokens_from_string(content)
-
+
+            self.update_state(state="PROGRESS", meta={"current": 60, "status": "Saving file"})
+            file_obj = io.BytesIO(file_content)
+
+            metadata = storage.save_file(file_obj, relative_path)
+
             mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
-
-            metadata = {
-                "storage_type": storage_type,
-            }
-
-            if storage_type == "s3":
-                metadata.update({
-                    "bucket_name": getattr(storage, "bucket_name", "docsgpt-test-bucket"),
-                    "uri": f"s3://{storage.bucket_name}/{relative_path}",
-                    "region": getattr(settings, "SAGEMAKER_REGION", "us-east-1")
-                })
-
+
             self.update_state(state="PROGRESS", meta={"current": 80, "status": "Storing in database"})
-
+
             doc_id = ObjectId(attachment_id)
             attachments_collection.insert_one({
                 "_id": doc_id,
@@ -387,12 +378,12 @@ def process_document(file_path, **kwargs):
                 "date": datetime.datetime.now(),
                 "metadata": metadata
             })
-
-            logging.info(f"Stored attachment with ID: {attachment_id}",
+
+            logging.info(f"Stored attachment with ID: {attachment_id}",
                          extra={"user": user})
-
+
             self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"})
-
+
             return {
                 "filename": filename,
                 "path": relative_path,
@@ -401,9 +392,7 @@ def process_document(file_path, **kwargs):
                 "mime_type": mime_type,
                 "metadata": metadata
             }
-
-        return storage.process_file(relative_path, process_document)
-
+
     except Exception as e:
         logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True)
         raise
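
For context, this is roughly the extraction flow the hunks above introduce: the uploaded bytes are spooled to a named temporary file so SimpleDirectoryReader can open them by path, and only then is the file persisted through the storage backend. A minimal sketch, assuming the DocsGPT application package is importable; extract_text is a hypothetical helper name, not part of this change.

import os
import tempfile

from application.parser.file.bulk import SimpleDirectoryReader
from application.utils import num_tokens_from_string


def extract_text(filename, file_content):
    # Hypothetical helper illustrating the temp-file pattern from the diff above.
    # Keep the original extension so the reader's file-type detection still works.
    with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1]) as temp_file:
        temp_file.write(file_content)
        temp_file.flush()
        documents = SimpleDirectoryReader(
            input_files=[temp_file.name],
            exclude_hidden=True,
            errors="ignore",
        ).load_data()
    if not documents:
        raise ValueError(f"Failed to extract content from file: {filename}")
    content = documents[0].text
    return content, num_tokens_from_string(content)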