@@ -268,6 +268,83 @@ def _query_uploads(
268
268
return final_query , proc .Upload .objects (final_query )
269
269
270
270
271
@uploads.command(help='Export the archive data of selected uploads to a zip file.')
@click.argument('UPLOADS', nargs=-1)
@click.option('--required', type=str, help='The required in JSON format')
@click.option('-o', '--output', type=str, help='The file to write data to')
@click.pass_context
def export(ctx, uploads, required, output: str):
    '''Export archive data of the selected uploads into a zip file.

    Writes one ``<entry_id>.json`` per entry, filtered through the
    ``--required`` specification. Prints a progress line every 100 entries.
    Exits with status 1 on missing/invalid ``--output`` or ``--required``.
    '''
    import sys
    import time
    import zipfile

    from nomad.processing import Entry
    from nomad.utils import get_logger
    from nomad.files import UploadFiles
    from nomad.archive import ArchiveQueryError, RequiredReader

    logger = get_logger(__name__)

    if not output:
        logger.error('no output given')
        sys.exit(1)

    if not output.endswith('.zip'):
        logger.error('only zip output is supported')
        sys.exit(1)

    # Validate --required before querying uploads or creating the output file,
    # so a bad invocation does not leave a stray empty zip behind.
    # NOTE: json.loads(None) raises TypeError, so a missing --required is caught here too.
    try:
        required_data = json.loads(required)
    except Exception as e:
        logger.error('could not parse required', exc_info=e)
        sys.exit(1)

    try:
        required_reader = RequiredReader(required_data)
    except Exception as e:
        logger.error('could not validate required', exc_info=e)
        sys.exit(1)

    _, uploads = _query_uploads(uploads, **ctx.obj.uploads_kwargs)

    start_time = time.time()  # wall-clock start, used only for progress output

    upload_count = 0
    total_count = 0
    # `with` guarantees the zip is closed (and its central directory written)
    # even if an unexpected exception interrupts the export.
    with zipfile.ZipFile(output, 'w', allowZip64=True) as output_file:

        def write(entry_id, archive_data):
            # One deflate-compressed JSON member per entry.
            archive_json = json.dumps(archive_data)
            output_file.writestr(
                f'{entry_id}.json', archive_json, compress_type=zipfile.ZIP_DEFLATED)

        for upload in uploads:
            upload_id = upload.upload_id
            upload_files = UploadFiles.get(upload_id)
            upload_count += 1
            entry_ids = list(entry.entry_id for entry in Entry.objects(upload_id=upload_id))
            entry_count = 0
            for entry_id in entry_ids:
                entry_count += 1
                total_count += 1
                try:
                    archive = upload_files.read_archive(entry_id, use_blocked_toc=False)
                    archive_data = required_reader.read(archive, entry_id, upload_id)
                    write(entry_id, archive_data)
                except ArchiveQueryError as e:
                    # Entry could not satisfy the required spec; skip it, keep going.
                    logger.error('could not read archive', exc_info=e, entry_id=entry_id)
                except KeyError as e:
                    # Entry has no archive data; skip it, keep going.
                    logger.error('missing archive', exc_info=e, entry_id=entry_id)

                if total_count % 100 == 0:
                    elapsed = time.time() - start_time
                    print(
                        f'{upload_count:5}/{len(uploads)} '
                        f'{entry_count:5}/{len(entry_ids)} '
                        f'{total_count:5} {elapsed} {upload_id}')

            upload_files.close()
346
+
347
+
271
348
@uploads .command (help = 'List selected uploads' )
272
349
@click .argument ('UPLOADS' , nargs = - 1 )
273
350
@click .option ('-e' , '--entries' , is_flag = True , help = 'Show details about entries.' )
0 commit comments