
Commit 482f982

MRG: upgrades; refactor to output frequencies; support skip-mers (#21)

* bump v
* cleanup and make thresholds configurable
* stop using mamba

1 parent db75e50 · commit 482f982

File tree

3 files changed: +161 -109 lines changed

.github/workflows/build-test.yml
Lines changed: 0 additions & 1 deletion

```diff
@@ -26,7 +26,6 @@ jobs:
           auto-update-conda: true
           python-version: 3.12
           channels: conda-forge,bioconda
-          miniforge-variant: Mambaforge
           miniforge-version: latest
           use-mamba: true
           mamba-version: "*"
```

pyproject.toml
Lines changed: 3 additions & 3 deletions

```diff
@@ -2,14 +2,14 @@
 name = "sourmash_plugin_pangenomics"
 description = "sourmash plugin to do pangenomics."
 readme = "README.md"
-requires-python = ">=3.10"
-version = "0.2.2"
+requires-python = ">=3.11"
+version = "0.3.0"
 authors = [
     {name = "Colton Baumler", email = "ccbaumler@ucdavis.edu"},
     {name = "Titus Brown", email = "titus@idyll.org"},
 ]
 
-dependencies = ["sourmash>=4.8.9,<5", "sourmash_utils>=0.2"]
+dependencies = ["sourmash>=4.9.0,<5", "sourmash_utils>=0.3"]
 
 [metadata]
 license = { text = "BSD 3-Clause License" }
```

src/sourmash_plugin_pangenomics.py
Lines changed: 158 additions & 105 deletions
```diff
@@ -22,6 +22,13 @@
 from sourmash.plugins import CommandLinePlugin
 from sourmash.save_load import SaveSignaturesToLocation
 
+DEFAULT_THRESHOLDS = dict(
+    CENTRAL_CORE = 0.95,
+    EXTERNAL_CORE = 0.90,
+    SHELL = 0.10,
+    INNER_CLOUD = 0.01,
+    SURFACE_CLOUD = 0.00,
+)
 
 CENTRAL_CORE = 1
 EXTERNAL_CORE = 2
```
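For orientation: these defaults are fractions of the maximum hash abundance observed across a pangenome, and the classifier added further down in this diff (`classify_pangenome_element`) tests them highest-first with `>=`. A sketch of the frequency bands implied by the defaults:

```python
# Bands implied by DEFAULT_THRESHOLDS, where
# freq = hash abundance / max abundance in the pangenome:
#
#   freq >= 0.95         -> CENTRAL_CORE
#   0.90 <= freq < 0.95  -> EXTERNAL_CORE
#   0.10 <= freq < 0.90  -> SHELL
#   0.01 <= freq < 0.10  -> INNER_CLOUD
#   0.00 <= freq < 0.01  -> SURFACE_CLOUD
```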
```diff
@@ -168,10 +175,13 @@ def __init__(self, subparser):
         p.add_argument("metagenome_sig")
         p.add_argument("ranktable_csv_files", nargs="+",
                        help="rank tables produced by pangenome_ranktable")
+        p.add_argument("--thresholds",
+                       help="colon-separated thresholds for central core, external core, shell, inner cloud, surface cloud, e.g. 95:90:10:01:00 (which is the default)")
         sourmash_utils.add_standard_minhash_args(p)
 
     def main(self, args):
         super().main(args)
+
         return classify_hashes_main(args)
 
 
```
```diff
@@ -354,182 +364,218 @@ def pangenome_merge_main(args):
 
         c.update(ss.minhash.hashes)
 
-        if mh is None:
-            mh = ss.minhash.to_mutable()
-        else:
-            mh += ss.minhash
-
     # save!
     print(f"Writing output sketches to '{args.output}'")
 
     sig_name = "merged"  # @CTB update with --name?
 
-    abund_mh = mh.copy_and_clear()  # hmm, don't need mh tracked above?
+    abund_mh = select_mh.copy_and_clear()  # hmm, don't need mh tracked above?
     abund_mh.track_abundance = True
     abund_mh.set_abundances(dict(c))
 
+    assert not os.path.exists(args.output)  # @CTB
     with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
         print(f"saving to '{args.output}'")
 
         ss = sourmash.SourmashSignature(abund_mh, name=sig_name)
         save_sigs.add(ss)
 
 
-def db_process(
+def load_all_sketches(
     filename,
-    ignore_case,
+    *,
     select_mh=None,
-    lineage_name="None",
 ):
+    """
+    Process a database of sketches into a single merged pangenome sketch.
+    """
     print(f"selecting sketches: {select_mh}")
 
     ss_dict = {}
     print(f"loading sketches from file '{filename}'")
     db = sourmash_utils.load_index_and_select(filename, select_mh)
     print(f"'{filename}' contains {len(db)} signatures")
 
-    if lineage_name:
-        print(f"Looking for {lineage_name} signature\n")
+    assert len(db) == 1, len(db)
 
-        # build the search pattern
-        pattern = lineage_name
-        if ignore_case:
-            pattern = re.compile(pattern, re.IGNORECASE)
-        else:
-            pattern = re.compile(pattern)
+    # load the entire database
+    for n, ss in enumerate(db.signatures()):
+        if n % 10 == 0:
+            print(f"...Processing {n} of {len(db)}", end="\r", flush=True)
 
-        def search_pattern(vals):
-            return any(pattern.search(val) for val in vals)
+        name = ss.name
 
-        # find all matching rows.
-        mf = db.manifest
-        sub_mf = mf.filter_on_columns(search_pattern, ["name", "filename", "md5"])
+        mh = ss.minhash
+        hashes = mh.hashes
+        ss_dict[name] = hashes
 
-        selected_sigs = []
-        print(f"Found {len(sub_mf)} signatures in '{filename}':")
+    print(f"...Processed {n+1} of {len(db)} \n")
 
-        for n, row in enumerate(sub_mf.rows, start=1):
-            print(f'{n:<15} \033[0;31m{row.get("name")}\033[0m')
-            selected_sigs.append(row.get("name"))
+    return ss_dict
 
-        sub_picklist = sub_mf.to_picklist()
 
-        try:
-            db = db.select(picklist=sub_picklist)
-        except ValueError:
-            error("Chosen lineage name input not supported")
+def load_sketches_by_lineage(filename,
+                             lineage_name,
+                             *,
+                             ignore_case=True,
+                             select_mh=None,
+                             ):
+    """
+    Process a database of sketches into a set of merged pangenome sketches,
+    based on matches to lineage.
+    """
+    print(f"selecting sketches: {select_mh}")
 
-        for ss in db.signatures():
-            name = ss.name
+    ss_dict = {}
+    print(f"loading sketches from file '{filename}'")
+    db = sourmash_utils.load_index_and_select(filename, select_mh)
+    print(f"'{filename}' contains {len(db)} signatures")
 
-            print(f"Found \033[0;31m{name}\033[0m in '{filename}'")
+    print(f"Looking for {lineage_name} signature\n")
 
-            mh = ss.minhash
-            hashes = mh.hashes
-            ss_dict[name] = hashes
+    # build the search pattern
+    if ignore_case:
+        pattern = re.compile(lineage_name, re.IGNORECASE)
+    else:
+        pattern = re.compile(lineage_name)
 
-        total_rows_examined = 0
-        total_rows_examined += len(mf)
+    def search_pattern(vals):
+        return any(pattern.search(val) for val in vals)
 
-        print(
-            f"\nloaded {total_rows_examined} total that matched ksize & molecule type"
-        )
+    # find all matching rows.
+    mf = db.manifest
+    sub_mf = mf.filter_on_columns(search_pattern, ["name", "filename", "md5"])
 
-        if ss_dict:
-            print(f"extracted {len(ss_dict)} signatures from {len(db)} file(s)\n")
+    selected_sigs = []
+    print(f"Found {len(sub_mf)} signatures in '{filename}':")
 
-        else:
-            print("no matching signatures found!\n")
-            sys.exit(-1)
+    for n, row in enumerate(sub_mf.rows, start=1):
+        print(f'{n:<15} \033[0;31m{row.get("name")}\033[0m')
+        selected_sigs.append(row.get("name"))
 
-    else:
-        # process the entire database
+    sub_picklist = sub_mf.to_picklist()
 
-        for n, ss in enumerate(db.signatures()):
-            if n % 10 == 0:
-                print(f"...Processing {n} of {len(db)}", end="\r", flush=True)
+    try:
+        db = db.select(picklist=sub_picklist)
+    except ValueError:
+        error("Chosen lineage name input not supported")
+        raise
+
+    for ss in db.signatures():
+        name = ss.name
 
-            name = ss.name
+        print(f"Found \033[0;31m{name}\033[0m in '{filename}'")
 
-            mh = ss.minhash
-            hashes = mh.hashes
-            ss_dict[name] = hashes
+        mh = ss.minhash
+        hashes = mh.hashes
+        ss_dict[name] = hashes
 
-        print(f"...Processed {n+1} of {len(db)} \n")
+    total_rows_examined = 0
+    total_rows_examined += len(mf)
+
+    print(
+        f"\nloaded {total_rows_examined} total that matched ksize & molecule type"
+    )
+
+    if ss_dict:
+        print(f"extracted {len(ss_dict)} signatures from {len(db)} file(s)\n")
+
+    else:
+        print("no matching signatures found!\n")
+        sys.exit(-1)
 
     return ss_dict
 
 
-def pangenome_elements(data):
+def calc_pangenome_element_frequency(data):
     # get the pangenome elements of the dicts for each rank pangenome
-    for i, (key, nested_dict) in enumerate(data.items()):
-        max_value = max(nested_dict.values())
-
-        central_core_threshold = 0.95  # 0.95 is core , 90% is technically soft core
-        external_core_threshold = 0.90
-        shell_threshold = 0.10  # 0.10
-        inner_cloud_threshold = 0.01  # 0.0 is the full cloud, but trimming (0.001?) may be necessary to create the viz...?
-        surface_cloud_threshold = 0.00
-
-        central_core = []
-        external_core = []
-        shell = []
-        inner_cloud = []
-        surface_cloud = []
-
-        for nested_key, nested_value in nested_dict.items():
-            if nested_value >= max_value * central_core_threshold:
-                central_core.append((nested_key, nested_value))
-            elif nested_value >= max_value * external_core_threshold:
-                external_core.append((nested_key, nested_value))
-            elif nested_value >= max_value * shell_threshold:
-                shell.append((nested_key, nested_value))
-            elif nested_value >= max_value * inner_cloud_threshold:
-                inner_cloud.append((nested_key, nested_value))
-            elif nested_value >= max_value * surface_cloud_threshold:
-                surface_cloud.append((nested_key, nested_value))
-    return central_core, external_core, shell, inner_cloud, surface_cloud
+    for i, (name, hash_dict) in enumerate(data.items()):
+        # get max abundance in genome
+        max_value = max(hash_dict.values())
+
+        # return all hashvals / hash_abunds, along with associated max value
+        items = hash_dict.items()
+        # sort by abund, highest first
+        items = sorted(items, key=lambda x: -x[1])
+        for hashval, hash_abund in items:
+            freq = hash_abund / max_value
+            freq = round(freq, 4)
+            yield hashval, freq, hash_abund, max_value
+
+
+def classify_pangenome_element(freq, *, thresholds=DEFAULT_THRESHOLDS):
+    # validate thresholds
+    min_threshold = 1.
+    for k, v in thresholds.items():
+        assert v == float(v)  # must be number
+        min_threshold = min(min_threshold, v)
+    assert min_threshold == 0  # must catch all hashes :)
+
+    if freq >= thresholds['CENTRAL_CORE']:
+        return CENTRAL_CORE
+    if freq >= thresholds['EXTERNAL_CORE']:
+        return EXTERNAL_CORE
+    if freq >= thresholds['SHELL']:
+        return SHELL
+    if freq >= thresholds['INNER_CLOUD']:
+        return INNER_CLOUD
+    if freq >= thresholds['SURFACE_CLOUD']:
+        return SURFACE_CLOUD
+
+    raise Exception("a hash slipped through the cracks")
 
 #
 # pangenome_ranktable
 #
 
+# @CTB - rename to count_hashes or something?
 def pangenome_ranktable_main(args):
     select_mh = sourmash_utils.create_minhash_from_args(args)
 
-    ss_dict = db_process(
-        filename=args.data,
-        select_mh=select_mh,
-        lineage_name=args.lineage,
-        ignore_case=args.ignore_case,
-    )
-    results = pangenome_elements(ss_dict)
+    # load a pre-existing merged/etc database, calc hash info.
+    if args.lineage:
+        ss_dict = load_sketches_by_lineage(args.data,
+                                           args.lineage,
+                                           ignore_case=args.ignore_case,
+                                           select_mh=select_mh)
+    else:
+        ss_dict = load_all_sketches(args.data, select_mh=select_mh)
+
+    frequencies = calc_pangenome_element_frequency(ss_dict)
 
     if args.output_hash_classification:
         print(
             f"Writing hash classification to CSV file '{args.output_hash_classification}'"
         )
         with open(args.output_hash_classification, "w", newline="") as fp:
             w = csv.writer(fp)
-            w.writerow(["hashval", "pangenome_classification"])
-            central_core, external_core, shell, inner_cloud, surface_cloud = results
+            w.writerow(["hashval", "freq", "abund", "max_abund"])
 
-            for xx, classify_code in (
-                (central_core, CENTRAL_CORE),
-                (external_core, EXTERNAL_CORE),
-                (shell, SHELL),
-                (inner_cloud, INNER_CLOUD),
-                (surface_cloud, SURFACE_CLOUD),
-            ):
-                for hashval, _ in xx:
-                    w.writerow([hashval, classify_code])
+            for hashval, freq, hash_abund, max_value in frequencies:
+                w.writerow([hashval, freq, hash_abund, max_value])
 
 
 #
 # pangenome_classify
 #
 
 def classify_hashes_main(args):
+    # @CTB print out the thresholds or something
+    thresholds_str = args.thresholds
+    thresholds = dict(DEFAULT_THRESHOLDS)
+    if thresholds_str:
+        threshold_vals = thresholds_str.split(':')
+        assert len(threshold_vals) == 5
+        threshold_vals = [ int(t)/100 for t in threshold_vals ]
+        assert max(threshold_vals) <= 1
+        assert min(threshold_vals) >= 0
+
+        thresholds['CENTRAL_CORE'] = threshold_vals[0]
+        thresholds['EXTERNAL_CORE'] = threshold_vals[1]
+        thresholds['SHELL'] = threshold_vals[2]
+        thresholds['INNER_CLOUD'] = threshold_vals[3]
+        thresholds['SURFACE_CLOUD'] = threshold_vals[4]
+
     select_mh = sourmash_utils.create_minhash_from_args(args)
     print(f"selecting sketches: {select_mh}")
 
```
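The net effect of this hunk: the old `pangenome_elements` is split into a frequency generator (`calc_pangenome_element_frequency`) and a pure classifier (`classify_pangenome_element`). A toy sketch of the new flow, assuming the module is importable; the hash values and abundance counts here are invented:

```python
from sourmash_plugin_pangenomics import (
    calc_pangenome_element_frequency,
    classify_pangenome_element,
)

# invented hash -> abundance counts for one merged pangenome
counts = {111: 200, 222: 190, 333: 30, 444: 1}
for hashval, freq, abund, max_abund in calc_pangenome_element_frequency(
        {"demo": counts}):
    print(hashval, freq, classify_pangenome_element(freq))
# hash 111: freq 1.0   -> CENTRAL_CORE  (freq >= 0.95)
# hash 222: freq 0.95  -> CENTRAL_CORE
# hash 333: freq 0.15  -> SHELL         (0.10 <= freq < 0.90)
# hash 444: freq 0.005 -> SURFACE_CLOUD (freq < 0.01)
```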
```diff
@@ -543,14 +589,21 @@ def classify_hashes_main(args):
     central_core_mh = minhash.copy_and_clear()
     shell_mh = minhash.copy_and_clear()
 
+    # load in all the frequencies etc, and classify
     for csv_file in args.ranktable_csv_files:
         with open(csv_file, "r", newline="") as fp:
             r = csv.DictReader(fp)
 
             classify_d = {}
             for row in r:
                 hashval = int(row["hashval"])
-                classify_as = int(row["pangenome_classification"])
+                abund = int(row["abund"])
+                max_abund = int(row["max_abund"])
+                freq = abund / max_abund
+                classify_as = classify_pangenome_element(freq,
+                                                         thresholds=thresholds)
+
+                assert hashval not in classify_d, "hashval already encountered"
                 classify_d[hashval] = classify_as
 
     # now, classify_d has the classifications we care about. Let's
```
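A design note on the two hunks above: the ranktable CSV no longer bakes in a classification column; it stores raw `freq`/`abund`/`max_abund`, and bucketing happens at classify time, so `--thresholds` can change without regenerating the ranktable. An invented before/after row for illustration:

```python
# Ranktable CSV row, before vs. after this commit (values invented):
#
#   old: hashval,pangenome_classification
#        1117538253452,1
#
#   new: hashval,freq,abund,max_abund
#        1117538253452,1.0,200,200
#
# classify_hashes_main recomputes freq = abund / max_abund from the
# integer columns rather than trusting the rounded freq field.
```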
