from sourmash.plugins import CommandLinePlugin
from sourmash.save_load import SaveSignaturesToLocation

+DEFAULT_THRESHOLDS = dict(
+    CENTRAL_CORE=0.95,
+    EXTERNAL_CORE=0.90,
+    SHELL=0.10,
+    INNER_CLOUD=0.01,
+    SURFACE_CLOUD=0.00,
+)
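+# note: these values are fractions of the maximum per-pangenome hash
+# abundance; see classify_pangenome_element() below.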

CENTRAL_CORE = 1
EXTERNAL_CORE = 2
@@ -168,10 +175,13 @@ def __init__(self, subparser):
        p.add_argument("metagenome_sig")
        p.add_argument("ranktable_csv_files", nargs="+",
                       help="rank tables produced by pangenome_ranktable")
+        p.add_argument("--thresholds",
+                       help="colon-separated thresholds for central core, external core, shell, inner cloud, surface cloud, e.g. 95:90:10:01:00 (which is the default)")
        sourmash_utils.add_standard_minhash_args(p)

    def main(self, args):
        super().main(args)
+
        return classify_hashes_main(args)

@@ -354,182 +364,218 @@ def pangenome_merge_main(args):

        c.update(ss.minhash.hashes)

-        if mh is None:
-            mh = ss.minhash.to_mutable()
-        else:
-            mh += ss.minhash
-
    # save!
    print(f"Writing output sketches to '{args.output}'")

    sig_name = "merged"  # @CTB update with --name?

-    abund_mh = mh.copy_and_clear()  # hmm, don't need mh tracked above?
+    abund_mh = select_mh.copy_and_clear()  # hmm, don't need mh tracked above?
    abund_mh.track_abundance = True
    abund_mh.set_abundances(dict(c))

+    assert not os.path.exists(args.output)  # @CTB
    with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
        print(f"saving to '{args.output}'")

        ss = sourmash.SourmashSignature(abund_mh, name=sig_name)
        save_sigs.add(ss)

-def db_process(
+def load_all_sketches(
    filename,
-    ignore_case,
+    *,
    select_mh=None,
-    lineage_name="None",
):
+    """
+    Load hashes from a database containing a single merged pangenome sketch.
+    """
    print(f"selecting sketches: {select_mh}")

    ss_dict = {}
    print(f"loading sketches from file '{filename}'")
    db = sourmash_utils.load_index_and_select(filename, select_mh)
    print(f"'{filename}' contains {len(db)} signatures")

-    if lineage_name:
-        print(f"Looking for {lineage_name} signature\n")
+    assert len(db) == 1, len(db)

-        # build the search pattern
-        pattern = lineage_name
-        if ignore_case:
-            pattern = re.compile(pattern, re.IGNORECASE)
-        else:
-            pattern = re.compile(pattern)
+    # load the entire database
+    for n, ss in enumerate(db.signatures()):
+        if n % 10 == 0:
+            print(f"...Processing {n} of {len(db)}", end="\r", flush=True)

-        def search_pattern(vals):
-            return any(pattern.search(val) for val in vals)
+        name = ss.name

-        # find all matching rows.
-        mf = db.manifest
-        sub_mf = mf.filter_on_columns(search_pattern, ["name", "filename", "md5"])
+        mh = ss.minhash
+        hashes = mh.hashes
+        ss_dict[name] = hashes

-        selected_sigs = []
-        print(f"Found {len(sub_mf)} signatures in '{filename}':")
+    print(f"...Processed {n + 1} of {len(db)}\n")

-        for n, row in enumerate(sub_mf.rows, start=1):
-            print(f'{n:<15} \033[0;31m{row.get("name")}\033[0m')
-            selected_sigs.append(row.get("name"))
+    return ss_dict

-        sub_picklist = sub_mf.to_picklist()

-        try:
-            db = db.select(picklist=sub_picklist)
-        except ValueError:
-            error("Chosen lineage name input not supported")
+def load_sketches_by_lineage(filename,
+                             lineage_name,
+                             *,
+                             ignore_case=True,
+                             select_mh=None,
+                             ):
+    """
+    Load hashes from the sketches in a database whose names match
+    lineage_name.
+    """
+    print(f"selecting sketches: {select_mh}")

-        for ss in db.signatures():
-            name = ss.name
+    ss_dict = {}
+    print(f"loading sketches from file '{filename}'")
+    db = sourmash_utils.load_index_and_select(filename, select_mh)
+    print(f"'{filename}' contains {len(db)} signatures")

-            print(f"Found \033[0;31m{name}\033[0m in '{filename}'")
+    print(f"Looking for {lineage_name} signature\n")

-            mh = ss.minhash
-            hashes = mh.hashes
-            ss_dict[name] = hashes
+    # build the search pattern
+    if ignore_case:
+        pattern = re.compile(lineage_name, re.IGNORECASE)
+    else:
+        pattern = re.compile(lineage_name)

-        total_rows_examined = 0
-        total_rows_examined += len(mf)
+    def search_pattern(vals):
+        return any(pattern.search(val) for val in vals)

-        print(
-            f"\nloaded {total_rows_examined} total that matched ksize & molecule type"
-        )
+    # find all matching rows.
+    mf = db.manifest
+    sub_mf = mf.filter_on_columns(search_pattern, ["name", "filename", "md5"])

-        if ss_dict:
-            print(f"extracted {len(ss_dict)} signatures from {len(db)} file(s)\n")
+    selected_sigs = []
+    print(f"Found {len(sub_mf)} signatures in '{filename}':")

-        else:
-            print("no matching signatures found!\n")
-            sys.exit(-1)
+    for n, row in enumerate(sub_mf.rows, start=1):
+        print(f'{n:<15} \033[0;31m{row.get("name")}\033[0m')
+        selected_sigs.append(row.get("name"))

-    else:
-        # process the entire database
+    sub_picklist = sub_mf.to_picklist()

-        for n, ss in enumerate(db.signatures()):
-            if n % 10 == 0:
-                print(f"...Processing {n} of {len(db)}", end="\r", flush=True)
+    try:
+        db = db.select(picklist=sub_picklist)
+    except ValueError:
+        error("Chosen lineage name input not supported")
+        raise
+
+    for ss in db.signatures():
+        name = ss.name

-            name = ss.name
+        print(f"Found \033[0;31m{name}\033[0m in '{filename}'")

-            mh = ss.minhash
-            hashes = mh.hashes
-            ss_dict[name] = hashes
+        mh = ss.minhash
+        hashes = mh.hashes
+        ss_dict[name] = hashes

-    print(f"...Processed {n + 1} of {len(db)}\n")
+    total_rows_examined = 0
+    total_rows_examined += len(mf)
+
+    print(
+        f"\nloaded {total_rows_examined} total that matched ksize & molecule type"
+    )
+
+    if ss_dict:
+        print(f"extracted {len(ss_dict)} signatures from {len(db)} file(s)\n")
+
+    else:
+        print("no matching signatures found!\n")
+        sys.exit(-1)

    return ss_dict

-def pangenome_elements(data):
+def calc_pangenome_element_frequency(data):
    # get the pangenome elements of the dicts for each rank pangenome
-    for i, (key, nested_dict) in enumerate(data.items()):
-        max_value = max(nested_dict.values())
-
-        central_core_threshold = 0.95  # 0.95 is core, 90% is technically soft core
-        external_core_threshold = 0.90
-        shell_threshold = 0.10  # 0.10
-        inner_cloud_threshold = 0.01  # 0.0 is the full cloud, but trimming (0.001?) may be necessary to create the viz...?
-        surface_cloud_threshold = 0.00
-
-        central_core = []
-        external_core = []
-        shell = []
-        inner_cloud = []
-        surface_cloud = []
-
-        for nested_key, nested_value in nested_dict.items():
-            if nested_value >= max_value * central_core_threshold:
-                central_core.append((nested_key, nested_value))
-            elif nested_value >= max_value * external_core_threshold:
-                external_core.append((nested_key, nested_value))
-            elif nested_value >= max_value * shell_threshold:
-                shell.append((nested_key, nested_value))
-            elif nested_value >= max_value * inner_cloud_threshold:
-                inner_cloud.append((nested_key, nested_value))
-            elif nested_value >= max_value * surface_cloud_threshold:
-                surface_cloud.append((nested_key, nested_value))
-    return central_core, external_core, shell, inner_cloud, surface_cloud
+    for i, (name, hash_dict) in enumerate(data.items()):
+        # get max abundance in genome
+        max_value = max(hash_dict.values())
+
+        # return all hashvals / hash_abunds, along with associated max value
+        items = hash_dict.items()
+        # sort by abund, highest first
+        items = sorted(items, key=lambda x: -x[1])
+        for hashval, hash_abund in items:
+            freq = hash_abund / max_value
+            freq = round(freq, 4)
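+            # e.g. abund 47 when the max abund in the pangenome is 50
+            # gives freq 0.94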
+            yield hashval, freq, hash_abund, max_value
+
+
+def classify_pangenome_element(freq, *, thresholds=DEFAULT_THRESHOLDS):
+    # validate thresholds
+    min_threshold = 1.
+    for k, v in thresholds.items():
+        assert v == float(v)  # must be number
+        min_threshold = min(min_threshold, v)
+    assert min_threshold == 0  # must catch all hashes :)
+
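+    # e.g. with DEFAULT_THRESHOLDS: freq 1.0 -> CENTRAL_CORE,
+    # 0.92 -> EXTERNAL_CORE, 0.5 -> SHELL, 0.05 -> INNER_CLOUD,
+    # 0.001 -> SURFACE_CLOUD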
+    if freq >= thresholds['CENTRAL_CORE']:
+        return CENTRAL_CORE
+    if freq >= thresholds['EXTERNAL_CORE']:
+        return EXTERNAL_CORE
+    if freq >= thresholds['SHELL']:
+        return SHELL
+    if freq >= thresholds['INNER_CLOUD']:
+        return INNER_CLOUD
+    if freq >= thresholds['SURFACE_CLOUD']:
+        return SURFACE_CLOUD
+
+    raise Exception("a hash slipped through the cracks")

#
# pangenome_ranktable
#

+# @CTB - rename to count_hashes or something?
def pangenome_ranktable_main(args):
    select_mh = sourmash_utils.create_minhash_from_args(args)

-    ss_dict = db_process(
-        filename=args.data,
-        select_mh=select_mh,
-        lineage_name=args.lineage,
-        ignore_case=args.ignore_case,
-    )
-    results = pangenome_elements(ss_dict)
+    # load a pre-existing merged/etc database, calc hash info.
+    if args.lineage:
+        ss_dict = load_sketches_by_lineage(args.data,
+                                           args.lineage,
+                                           ignore_case=args.ignore_case,
+                                           select_mh=select_mh)
+    else:
+        ss_dict = load_all_sketches(args.data, select_mh=select_mh)
+
+    frequencies = calc_pangenome_element_frequency(ss_dict)

    if args.output_hash_classification:
        print(
            f"Writing hash classification to CSV file '{args.output_hash_classification}'"
        )
        with open(args.output_hash_classification, "w", newline="") as fp:
            w = csv.writer(fp)
-            w.writerow(["hashval", "pangenome_classification"])
-            central_core, external_core, shell, inner_cloud, surface_cloud = results
+            w.writerow(["hashval", "freq", "abund", "max_abund"])

-            for xx, classify_code in (
-                (central_core, CENTRAL_CORE),
-                (external_core, EXTERNAL_CORE),
-                (shell, SHELL),
-                (inner_cloud, INNER_CLOUD),
-                (surface_cloud, SURFACE_CLOUD),
-            ):
-                for hashval, _ in xx:
-                    w.writerow([hashval, classify_code])
+            for hashval, freq, hash_abund, max_value in frequencies:
+                w.writerow([hashval, freq, hash_abund, max_value])


#
# pangenome_classify
#

def classify_hashes_main(args):
+    # @CTB print out the thresholds or something
+    thresholds_str = args.thresholds
+    thresholds = dict(DEFAULT_THRESHOLDS)
+    if thresholds_str:
+        threshold_vals = thresholds_str.split(':')
+        assert len(threshold_vals) == 5
+        threshold_vals = [int(t) / 100 for t in threshold_vals]
+        assert max(threshold_vals) <= 1
+        assert min(threshold_vals) >= 0
+
+        thresholds['CENTRAL_CORE'] = threshold_vals[0]
+        thresholds['EXTERNAL_CORE'] = threshold_vals[1]
+        thresholds['SHELL'] = threshold_vals[2]
+        thresholds['INNER_CLOUD'] = threshold_vals[3]
+        thresholds['SURFACE_CLOUD'] = threshold_vals[4]
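+        # e.g. "--thresholds 95:90:10:01:00" reproduces the defaults
+        # (0.95 / 0.90 / 0.10 / 0.01 / 0.00)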
578
+
533
579
select_mh = sourmash_utils .create_minhash_from_args (args )
534
580
print (f"selecting sketches: { select_mh } " )
535
581
@@ -543,14 +589,21 @@ def classify_hashes_main(args):
543
589
central_core_mh = minhash .copy_and_clear ()
544
590
shell_mh = minhash .copy_and_clear ()
545
591
592
+ # load in all the frequencies etc, and classfy
546
593
for csv_file in args .ranktable_csv_files :
547
594
with open (csv_file , "r" , newline = "" ) as fp :
548
595
r = csv .DictReader (fp )
549
596
550
597
classify_d = {}
551
598
for row in r :
552
599
hashval = int (row ["hashval" ])
553
- classify_as = int (row ["pangenome_classification" ])
+                abund = int(row["abund"])
+                max_abund = int(row["max_abund"])
+                freq = abund / max_abund
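+                # note: freq is recomputed from the raw counts; the "freq"
+                # column written by pangenome_ranktable is rounded to 4 digits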
+                classify_as = classify_pangenome_element(freq,
+                                                         thresholds=thresholds)
+
+                assert hashval not in classify_d, "hashval already encountered"
                classify_d[hashval] = classify_as

            # now, classify_d has the classifications we care about. Let's