@@ -133,62 +133,91 @@ def ingest_worker(
     limit = None
     exclude = True
     sample = False
+
+    storage = StorageCreator.get_storage()
+
     full_path = os.path.join(directory, user, name_job)
-
+    source_file_path = os.path.join(full_path, filename)
+
     logging.info(f"Ingest file: {full_path}", extra={"user": user, "job": name_job})
-    file_data = {"name": name_job, "file": filename, "user": user}
+
+    # Create temporary working directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        try:
+            os.makedirs(temp_dir, exist_ok=True)
+
+            # Download file from storage to temp directory
+            temp_file_path = os.path.join(temp_dir, filename)
+            file_data = storage.get_file(source_file_path)
+
+            with open(temp_file_path, 'wb') as f:
+                f.write(file_data.read())
+
+            self.update_state(state="PROGRESS", meta={"current": 1})
+
+            # Handle zip files
+            if filename.endswith('.zip'):
+                logging.info(f"Extracting zip file: {filename}")
+                extract_zip_recursive(
+                    temp_file_path,
+                    temp_dir,
+                    current_depth=0,
+                    max_depth=RECURSION_DEPTH
+                )
+
+            if sample:
+                logging.info(f"Sample mode enabled. Using {limit} documents.")
 
-    if not os.path.exists(full_path):
-        os.makedirs(full_path)
-        download_file(urljoin(settings.API_URL, "/api/download"), file_data, os.path.join(full_path, filename))
+            reader = SimpleDirectoryReader(
+                input_dir=temp_dir,
+                input_files=input_files,
+                recursive=recursive,
+                required_exts=formats,
+                exclude_hidden=exclude,
+                file_metadata=metadata_from_filename,
+            )
+            raw_docs = reader.load_data()
 
-    # check if file is .zip and extract it
-    if filename.endswith(".zip"):
-        extract_zip_recursive(
-            os.path.join(full_path, filename), full_path, 0, RECURSION_DEPTH
-        )
+            chunker = Chunker(
+                chunking_strategy="classic_chunk",
+                max_tokens=MAX_TOKENS,
+                min_tokens=MIN_TOKENS,
+                duplicate_headers=False
+            )
+            raw_docs = chunker.chunk(documents=raw_docs)
+
+            docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+
+            id = ObjectId()
+
+            vector_store_path = os.path.join(temp_dir, 'vector_store')
+            os.makedirs(vector_store_path, exist_ok=True)
+
+            embed_and_store_documents(docs, vector_store_path, id, self)
+
+            tokens = count_tokens_docs(docs)
+
+            self.update_state(state="PROGRESS", meta={"current": 100})
+
+            if sample:
+                for i in range(min(5, len(raw_docs))):
+                    logging.info(f"Sample document {i}: {raw_docs[i]}")
+
+            file_data = {
+                "name": name_job,
+                "file": filename,
+                "user": user,
+                "tokens": tokens,
+                "retriever": retriever,
+                "id": str(id),
+                "type": "local",
+            }
 
-    self.update_state(state="PROGRESS", meta={"current": 1})
 
-    raw_docs = SimpleDirectoryReader(
-        input_dir=full_path,
-        input_files=input_files,
-        recursive=recursive,
-        required_exts=formats,
-        num_files_limit=limit,
-        exclude_hidden=exclude,
-        file_metadata=metadata_from_filename,
-    ).load_data()
-
-    chunker = Chunker(
-        chunking_strategy="classic_chunk",
-        max_tokens=MAX_TOKENS,
-        min_tokens=MIN_TOKENS,
-        duplicate_headers=False
-    )
-    raw_docs = chunker.chunk(documents=raw_docs)
-
-    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-    id = ObjectId()
-
-    embed_and_store_documents(docs, full_path, id, self)
-    tokens = count_tokens_docs(docs)
-    self.update_state(state="PROGRESS", meta={"current": 100})
-
-    if sample:
-        for i in range(min(5, len(raw_docs))):
-            logging.info(f"Sample document {i}: {raw_docs[i]}")
-
-    file_data.update({
-        "tokens": tokens,
-        "retriever": retriever,
-        "id": str(id),
-        "type": "local",
-    })
-    upload_index(full_path, file_data)
-
-    # delete local
-    shutil.rmtree(full_path)
+            upload_index(vector_store_path, file_data)
+
+        except Exception as e:
+            logging.error(f"Error in ingest_worker: {e}", exc_info=True)
+            raise
 
     return {
         "directory": directory,