@@ -155,18 +155,19 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
    lst_file_name.append({'fileName': obj_source_node.file_name, 'fileSize': obj_source_node.file_size, 'url': obj_source_node.url, 'language': obj_source_node.language, 'status': 'Success'})
    return lst_file_name, success_count, failed_count

-def extract_graph_from_file_local_file(graph, model, merged_file_path, fileName, allowedNodes, allowedRelationship):
+def extract_graph_from_file_local_file(graph, model, merged_file_path, fileName, allowedNodes, allowedRelationship, uri):

    logging.info(f'Process file name :{fileName}')
    gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
    if gcs_file_cache == 'True' and (fileName.split('.')[-1]).upper() == 'PDF':
-        file_name, pages = get_documents_from_gcs(PROJECT_ID, BUCKET_UPLOAD, None, fileName)
+        folder_name = create_gcs_bucket_folder_name_hashed(uri, fileName)
+        file_name, pages = get_documents_from_gcs(PROJECT_ID, BUCKET_UPLOAD, None, fileName, folder_name_sha1_hashed=folder_name)
    else:
        file_name, pages, file_extension = get_documents_from_file_by_path(merged_file_path, fileName)
    if pages == None or len(pages) == 0:
        raise Exception(f'Pdf content is not available for file : {file_name}')

-    return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path)
+    return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship, True, merged_file_path, uri)

def extract_graph_from_file_s3(graph, model, source_url, aws_access_key_id, aws_secret_access_key, allowedNodes, allowedRelationship):
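Note on the new `uri` argument: the local-file path now derives a per-connection folder name before reading the cached PDF from GCS, so files uploaded against different Neo4j databases no longer share a flat namespace in `BUCKET_UPLOAD`. The helper `create_gcs_bucket_folder_name_hashed` is not shown in this hunk; a minimal sketch of what it could look like, assuming it simply SHA-1-hashes the connection URI together with the file name (the `folder_name_sha1_hashed` keyword in the call above suggests as much):

```python
import hashlib

def create_gcs_bucket_folder_name_hashed(uri: str, file_name: str) -> str:
    # Illustrative sketch, not the project's actual implementation: build a
    # deterministic folder prefix from (connection URI, file name) so the same
    # upload always maps to the same GCS "folder" and different databases
    # cannot collide on identically named files.
    return hashlib.sha1((uri + file_name).encode('utf-8')).hexdigest()
```

Because the hash is deterministic, any later step that only knows the `uri` and file name can recompute the prefix instead of having to persist it.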
@@ -206,7 +207,7 @@ def extract_graph_from_file_gcs(graph, model, gcs_project_id, gcs_bucket_name, g
    return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship)

-def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None):
+def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship, is_uploaded_from_local=None, merged_file_path=None, uri=None):
    """
    Extracts a Neo4jGraph from a PDF file based on the model.
@@ -308,7 +309,8 @@ def processing_source(graph, model, file_name, pages, allowedNodes, allowedRelat
    if is_uploaded_from_local:
        gcs_file_cache = os.environ.get('GCS_FILE_CACHE')
        if gcs_file_cache == 'True' and (file_name.split('.')[-1]).upper() == 'PDF':
-            delete_file_from_gcs(BUCKET_UPLOAD, file_name)
+            folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name)
+            delete_file_from_gcs(BUCKET_UPLOAD, folder_name, file_name)
        else:
            delete_uploaded_local_file(merged_file_path, file_name)
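Clean-up after processing follows the same convention: the cached PDF is now looked up under its hashed folder rather than at the bucket root, so `delete_file_from_gcs` gains a `folder_name` argument. A rough sketch of a folder-aware delete, assuming the `google-cloud-storage` client and the `<folder>/<file>` object layout described above:

```python
from google.cloud import storage

def delete_file_from_gcs(bucket_name: str, folder_name: str, file_name: str) -> None:
    # Illustrative sketch: remove the merged PDF and any leftover chunk parts
    # stored under the hashed folder for this file.
    bucket = storage.Client().bucket(bucket_name)
    for blob in bucket.list_blobs(prefix=f"{folder_name}/{file_name}"):
        blob.delete()
```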
@@ -423,7 +425,8 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina
    logging.info(f'gcs file cache: {gcs_file_cache}')

    if gcs_file_cache == 'True' and (originalname.split('.')[-1]).upper() == 'PDF':
-        upload_file_to_gcs(chunk, chunk_number, originalname, BUCKET_UPLOAD)
+        folder_name = create_gcs_bucket_folder_name_hashed(uri, originalname)
+        upload_file_to_gcs(chunk, chunk_number, originalname, BUCKET_UPLOAD, folder_name)
    else:
        if not os.path.exists(chunk_dir):
            os.mkdir(chunk_dir)
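`upload_file` computes the same hashed folder as chunks arrive, so the chunk objects and the later merged file all live under one prefix. A sketch of a folder-aware `upload_file_to_gcs`, assuming `chunk` is a FastAPI `UploadFile`-style object and that chunk blobs are named `<original>_part_<n>` (both assumptions, not confirmed by the diff):

```python
from google.cloud import storage

def upload_file_to_gcs(chunk, chunk_number, original_name, bucket_name, folder_name):
    # Illustrative sketch: write each incoming chunk under the hashed folder,
    # e.g. '<folder_name>/<original_name>_part_3', so concurrent uploads from
    # different connections cannot overwrite each other's parts.
    bucket = storage.Client().bucket(bucket_name)
    blob = bucket.blob(f"{folder_name}/{original_name}_part_{chunk_number}")
    blob.upload_from_file(chunk.file)
```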
@@ -437,7 +440,8 @@ def upload_file(graph, model, chunk, chunk_number:int, total_chunks:int, origina
    if int(chunk_number) == int(total_chunks):
        # If this is the last chunk, merge all chunks into a single file
        if gcs_file_cache == 'True' and (originalname.split('.')[-1]).upper() == 'PDF':
-            total_pages, file_size = merge_file_gcs(BUCKET_UPLOAD, originalname)
+            file_size = merge_file_gcs(BUCKET_UPLOAD, originalname, folder_name)
+            total_pages = 1
        else:
            total_pages, file_size = merge_chunks_local(originalname, int(total_chunks), chunk_dir, merged_dir)
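On the GCS path, `merge_file_gcs` now takes the hashed folder and is assumed to return only the merged file's size; the page count is recorded as a placeholder `1`, presumably refreshed later when the PDF is actually parsed. A sketch of a server-side merge using `Blob.compose`, under the same chunk-naming assumption as the previous sketch:

```python
from google.cloud import storage

def merge_file_gcs(bucket_name: str, original_name: str, folder_name: str) -> int:
    # Illustrative sketch: concatenate the uploaded chunk blobs into a single
    # object without downloading them, delete the parts, and return the size
    # of the merged file in bytes.
    bucket = storage.Client().bucket(bucket_name)
    chunks = sorted(
        bucket.list_blobs(prefix=f"{folder_name}/{original_name}_part_"),
        key=lambda b: int(b.name.rsplit('_', 1)[-1]),
    )
    merged = bucket.blob(f"{folder_name}/{original_name}")
    merged.compose(chunks)   # GCS compose accepts at most 32 source objects per call
    for part in chunks:
        part.delete()
    merged.reload()          # refresh metadata so .size is populated
    return merged.size
```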
@@ -474,7 +478,7 @@ def get_labels_and_relationtypes(graph):
        result = []
    return result

-def manually_cancelled_job(graph, filenames, source_types, merged_dir):
+def manually_cancelled_job(graph, filenames, source_types, merged_dir, uri):

    filename_list = list(map(str.strip, json.loads(filenames)))
    source_types_list = list(map(str.strip, json.loads(source_types)))
@@ -491,7 +495,8 @@ def manually_cancelled_job(graph, filenames, source_types, merged_dir):
        obj_source_node = None
        merged_file_path = os.path.join(merged_dir, file_name)
        if source_type == 'local file' and gcs_file_cache == 'True' and (file_name.split('.')[-1]).upper() == 'PDF':
-            delete_file_from_gcs(BUCKET_UPLOAD, file_name)
+            folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name)
+            delete_file_from_gcs(BUCKET_UPLOAD, folder_name, file_name)
        else:
            logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
            delete_uploaded_local_file(merged_file_path, file_name)
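`manually_cancelled_job` needs the same `uri` for the same reason: the hashed folder is never stored anywhere, so cancellation recomputes it and must do so with exactly the connection URI used at upload time. A tiny check of that invariant, using a made-up example URI and file name:

```python
# The folder name is a pure function of (uri, file_name), so upload and
# cancellation agree on the prefix as long as they see the same URI.
# 'neo4j+s://example.databases.neo4j.io' and 'report.pdf' are placeholder values.
upload_folder = create_gcs_bucket_folder_name_hashed('neo4j+s://example.databases.neo4j.io', 'report.pdf')
cancel_folder = create_gcs_bucket_folder_name_hashed('neo4j+s://example.databases.neo4j.io', 'report.pdf')
assert upload_folder == cancel_folder
```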