99from v03_pipeline .lib .reference_datasets .misc import vcf_to_ht
1010
1111
def remove_duplicate_scores(ht: hl.Table) -> hl.Table:
    """Reduce SpliceAI rows that share a key to at most one row per key.

    SpliceAI has many duplicate rows of the ilk:

    1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|AL645608.1|0.00|0.00|0.00|0.00|2|27|12|1"]
    1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|SAMD11|0.02|0.01|0.00|0.00|14|38|14|38"]

    Rows whose key occurs exactly once are passed through unchanged. For keys
    that occur more than once, rows whose second pipe-delimited SpliceAI field
    (``AL645608.1`` vs. ``SAMD11`` in the example above) contains a dot are
    discarded, and one of the remaining rows is then kept via ``.distinct()``.

    Args:
        ht: Keyed table with an ``info.SpliceAI`` array whose first element is
            a pipe-delimited annotation string.

    Returns:
        The union of the never-duplicated rows and the de-duplicated rows.
    """
    # Count rows per key so keys that occur more than once can be isolated.
    key_counts_ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
    duplicated_keys_ht = key_counts_ht.filter(key_counts_ht.n > 1)

    # Partition the input by key into "has duplicates" / "has no duplicates".
    duplicates_ht = ht.semi_join(duplicated_keys_ht)
    non_duplicates_ht = ht.anti_join(duplicates_ht)

    # The delimiter is a regular expression, so the pipe must be escaped to be
    # matched literally (an unescaped or space-separated `|` is an alternation
    # and would not split the annotation into its fields).
    second_field = duplicates_ht.info.SpliceAI[0].split(delim=r'\|')[1]

    # Remove rows that 1) are part of duplicate variant groupings
    # and 2) contain dots. Then, remove arbitrarily with .distinct()
    #
    # NOTE(review): if every row of a duplicated key contains a dot, that key
    # is dropped from the result entirely -- confirm this is intended.
    deduplicated_ht = duplicates_ht.filter(~second_field.contains('.')).distinct()

    return non_duplicates_ht.union(deduplicated_ht)
30+
31+
1232def get_ht (
1333 paths : list [str ],
1434 reference_genome : ReferenceGenome ,
@@ -26,6 +46,7 @@ def get_ht(
2646 # of partitions.
2747 )
2848 ht , _ = checkpoint (ht )
49+ ht = remove_duplicate_scores (ht )
2950
3051 # SpliceAI INFO field description from the VCF header: SpliceAIv1.3 variant annotation. These include
3152 # delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and
@@ -39,7 +60,7 @@ def get_ht(
3960 .map (hl .float32 ),
4061 )
4162 ht = ht .annotate (delta_score = hl .max (ht .delta_scores ))
42- ht = ht .annotate (
63+ return ht .annotate (
4364 splice_consequence_id = hl .if_else (
4465 ht .delta_score > 0 ,
4566 # Splice Consequence enum ID is the index of the max score
@@ -48,6 +69,3 @@ def get_ht(
4869 num_delta_scores ,
4970 ),
5071 ).drop ('delta_scores' )
51- return ht .group_by (* ht .key ).aggregate (
52- splice_consequence_id = hl .agg .min (ht .splice_consequence_id ),
53- )
0 commit comments