1212import uuid
1313import asyncio
1414from fastapi .responses import Response
15- from fastapi import File , UploadFile , HTTPException
15+ from fastapi import UploadFile , HTTPException
1616from encapsulation .data_model .orm_models import FileMetadata , FileStatus
1717
1818class Knowledge (AbstractModule ):
@@ -22,10 +22,9 @@ def __init__(self, config: 'KnowledgeConfig'):
2222 self .file_index = config .index_manager_config .build ()
2323
2424 # Semaphore to control concurrent indexing operations
25- max_concurrent_indexing = config .max_concurrent_indexing
26- self .indexing_semaphore = asyncio .Semaphore (max_concurrent_indexing )
25+ self .indexing_semaphore = asyncio .Semaphore (config .max_concurrent_indexing )
2726
28- def upload_file (self , file : UploadFile , user_id : uuid .UUID ) -> str :
27+ async def upload_file (self , file : UploadFile , user_id : uuid .UUID ) -> str :
2928 try :
3029 doc_id = self .file_storage .upload_file (
3130 filename = file .filename ,
@@ -34,29 +33,17 @@ def upload_file(self, file: UploadFile, user_id: uuid.UUID) -> str:
3433 content_type = file .content_type
3534 )
3635 # Start indexing in background (fire-and-forget)
37- self ._start_background_indexing (doc_id )
36+ # execute file indexing without waiting for it to complete
37+ task = asyncio .create_task (self ._index_file_background (doc_id ))
38+ # Add error callback to log any unhandled exceptions
39+ task .add_done_callback (lambda t : logger .error (f"Background indexing task failed: { t .exception ()} " ) if t .exception () else None )
3840 logger .info (f"File { file .filename } uploaded with ID { doc_id } , indexing started in background" )
3941 return doc_id
4042
4143 except Exception as e :
4244 logger .error (e )
4345 raise
4446
45- def _start_background_indexing (self , doc_id : str ):
46- """Start background indexing task safely"""
47- try :
48- # Try to get the current event loop
49- loop = asyncio .get_running_loop ()
50- # If we're in an async context, create the task
51- loop .create_task (self ._index_file_background (doc_id ))
52- except RuntimeError :
53- # No event loop running, start a new one in a thread
54- import threading
55- def run_async ():
56- asyncio .run (self ._index_file_background (doc_id ))
57- thread = threading .Thread (target = run_async , daemon = True )
58- thread .start ()
59-
6047 async def _index_file_background (self , doc_id : str ):
6148 """Background task for indexing files with semaphore control"""
6249 async with self .indexing_semaphore :
@@ -178,7 +165,7 @@ def count_user_files(
178165 logger .error (f"Failed to count files for user { user_id } : { e } " )
179166 raise HTTPException (status_code = 500 , detail = f"Failed to count files: { str (e )} " )
180167
async def trigger_indexing(self, file_ids: List[str], user_id: uuid.UUID) -> str:
    """
    Validate the requested files and schedule the eligible ones for indexing.

    A file is eligible only when its metadata can be read, it is owned by
    ``user_id`` and its status is ``FileStatus.STORED`` or
    ``FileStatus.FAILED``. Files in any other status are reported as
    skipped. Files that are missing, owned by someone else, or whose
    metadata lookup raised are reported as invalid.

    Indexing itself runs as a background task; this coroutine returns as
    soon as the task has been scheduled, without waiting for it to finish.

    Args:
        file_ids: IDs of the files to index.
        user_id: UUID of the user requesting the indexing.

    Returns:
        Newline-separated summary listing the files scheduled for indexing,
        the skipped files and the invalid files. When nothing is eligible,
        the summary ends with "No files scheduled for indexing.".
    """
    valid_files = []
    invalid_files = []
    skipped_files = []

    for file_id in file_ids:
        try:
            metadata = self.file_storage.get_file_metadata(file_id)
            if not metadata:
                invalid_files.append(f"File not found or invalid: {file_id}")
                continue
            if metadata.owner_id != user_id:
                invalid_files.append(f"You are not authorized to operate on this file: {file_id}")
                continue

            # Only STORED or FAILED files may be (re)indexed; every other
            # status is reported back to the caller as skipped.
            if metadata.status in (FileStatus.STORED, FileStatus.FAILED):
                valid_files.append(file_id)
            else:
                skipped_files.append(file_id)
        except Exception:
            # One unreadable file must not abort validation of the others.
            invalid_files.append(file_id)
            logger.exception(f"Error accessing file {file_id}")
            continue

    # Nothing eligible: report why and return without scheduling anything.
    if not valid_files:
        message_parts = []
        if invalid_files:
            message_parts.append(f"Invalid files: {'; '.join(invalid_files)}")
        if skipped_files:
            message_parts.append(f"Skipped files (already indexed or in progress): {'; '.join(skipped_files)}")
        message_parts.append("No files scheduled for indexing.")
        return "\n".join(message_parts)

    logger.info(
        f"Triggering indexing for files: {'; '.join(valid_files)}"
    )

    # Schedule the indexing instead of awaiting it, so the caller really does
    # get the "Indexing started" summary before the work completes.
    task = asyncio.create_task(
        self._index_multiple_files_background(valid_files, user_id)
    )
    # asyncio keeps only weak references to tasks, so hold a strong one until
    # the task finishes to prevent it from being garbage-collected mid-run.
    background_tasks = getattr(self, "_background_indexing_tasks", None)
    if background_tasks is None:
        background_tasks = self._background_indexing_tasks = set()
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)

    message_parts = [
        f"Indexing started for files: {'; '.join(valid_files)}"
    ]
    if skipped_files:
        message_parts.append(f"Skipped files (already indexed or in progress): {'; '.join(skipped_files)}")
    if invalid_files:
        message_parts.append(f"Invalid files: {'; '.join(invalid_files)}")

    return "\n".join(message_parts)
255234
async def _index_multiple_files_background(self, file_ids: List[str], user_id: uuid.UUID):
    """
    Index a batch of files through the index manager and log the outcome.

    Awaits ``self.file_index.process_multiple_files(file_ids)`` and logs the
    counts it reports. This method does not acquire
    ``self.indexing_semaphore`` itself. Any ``Exception`` raised while
    indexing is logged and swallowed, so it never propagates to the caller.

    Args:
        file_ids: IDs of the files to index.
        user_id: UUID of the requesting user; used only in log messages.
    """
    try:
        logger.info(f"Starting background indexing for {len(file_ids)} files for user {user_id}")

        # Use the IndexManager's process_multiple_files method
        result = await self.file_index.process_multiple_files(file_ids)

        logger.info(f"Background indexing completed for user {user_id}: {result.get('successful_files', 0)} successful, {result.get('failed_files', 0)} failed out of {result.get('total_files', 0)} files")

    except Exception as e:
        # logger.exception keeps the traceback, which a plain error log loses.
        logger.exception(f"Background indexing failed for user {user_id}: {str(e)}")
    finally:
        logger.debug(f"Background indexing finished for user {user_id} ({len(file_ids)} files)")
271250
272251 def get_indexing_status (self ) -> Dict [str , Any ]:
273252 """
0 commit comments