Skip to content

Commit 0d3e615

Browse files
(feat:attachmentUpload) parse content before upload
1 parent 68e4cf4 commit 0d3e615

File tree

1 file changed

+27
-38
lines changed

1 file changed

+27
-38
lines changed

application/worker.py

+27 -38
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,12 @@
77
import datetime
88
import mimetypes
99
import requests
10+
import tempfile
1011

1112
from collections import Counter
1213
from urllib.parse import urljoin
1314

14-
from application.storage.storage_creator import StorageCreator
15+
from application.storage.storage_creator import StorageCreator
1516
from application.utils import num_tokens_from_string
1617
from application.core.settings import settings
1718
from application.parser.file.bulk import SimpleDirectoryReader
@@ -209,7 +210,7 @@ def remote_worker(
209210
sync_frequency="never",
210211
operation_mode="upload",
211212
doc_id=None,
212-
):
213+
):
213214
full_path = os.path.join(directory, user, name_job)
214215
if not os.path.exists(full_path):
215216
os.makedirs(full_path)
@@ -324,58 +325,48 @@ def attachment_worker(self, file_info, user):
324325
"""
325326
Process and store a single attachment without vectorization.
326327
"""
327-
328+
328329
mongo = MongoDB.get_client()
329330
db = mongo["docsgpt"]
330331
attachments_collection = db["attachments"]
331-
332+
332333
filename = file_info["filename"]
333334
attachment_id = file_info["attachment_id"]
334335
relative_path = file_info["path"]
335336
file_content = file_info["file_content"]
336-
337+
337338
try:
338339
self.update_state(state="PROGRESS", meta={"current": 10})
339-
340340
storage_type = getattr(settings, "STORAGE_TYPE", "local")
341341
storage = StorageCreator.create_storage(storage_type)
342-
343-
self.update_state(state="PROGRESS", meta={"current": 30, "status": "Saving file"})
344-
file_obj = io.BytesIO(file_content)
345-
storage.save_file(file_obj, relative_path)
346-
347-
def process_document(file_path, **kwargs):
348-
self.update_state(state="PROGRESS", meta={"current": 50, "status": "Processing content"})
349-
342+
self.update_state(state="PROGRESS", meta={"current": 30, "status": "Processing content"})
343+
344+
with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1]) as temp_file:
345+
temp_file.write(file_content)
346+
temp_file.flush()
350347
reader = SimpleDirectoryReader(
351-
input_files=[file_path],
348+
input_files=[temp_file.name],
352349
exclude_hidden=True,
353350
errors="ignore"
354351
)
355352
documents = reader.load_data()
356-
353+
357354
if not documents:
358355
logging.warning(f"No content extracted from file: {filename}")
359356
raise ValueError(f"Failed to extract content from file: {filename}")
360-
357+
361358
content = documents[0].text
362359
token_count = num_tokens_from_string(content)
363-
360+
361+
self.update_state(state="PROGRESS", meta={"current": 60, "status": "Saving file"})
362+
file_obj = io.BytesIO(file_content)
363+
364+
metadata = storage.save_file(file_obj, relative_path)
365+
364366
mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
365-
366-
metadata = {
367-
"storage_type": storage_type,
368-
}
369-
370-
if storage_type == "s3":
371-
metadata.update({
372-
"bucket_name": getattr(storage, "bucket_name", "docsgpt-test-bucket"),
373-
"uri": f"s3://{storage.bucket_name}/{relative_path}",
374-
"region": getattr(settings, "SAGEMAKER_REGION", "us-east-1")
375-
})
376-
367+
377368
self.update_state(state="PROGRESS", meta={"current": 80, "status": "Storing in database"})
378-
369+
379370
doc_id = ObjectId(attachment_id)
380371
attachments_collection.insert_one({
381372
"_id": doc_id,
@@ -387,12 +378,12 @@ def process_document(file_path, **kwargs):
387378
"date": datetime.datetime.now(),
388379
"metadata": metadata
389380
})
390-
391-
logging.info(f"Stored attachment with ID: {attachment_id}",
381+
382+
logging.info(f"Stored attachment with ID: {attachment_id}",
392383
extra={"user": user})
393-
384+
394385
self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"})
395-
386+
396387
return {
397388
"filename": filename,
398389
"path": relative_path,
@@ -401,9 +392,7 @@ def process_document(file_path, **kwargs):
401392
"mime_type": mime_type,
402393
"metadata": metadata
403394
}
404-
405-
return storage.process_file(relative_path, process_document)
406-
395+
407396
except Exception as e:
408397
logging.error(f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True)
409398
raise

0 commit comments

Comments
 (0)