Bug fix for SVrefine recovering lost variants

nhansen · Dec 26, 2019 · b29e09a · b29e09a
1 parent ddd5fa7
commit b29e09a
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 18 deletions.
diff --git a/lib/NHGRI/SVanalyzer/Comp.pm b/lib/NHGRI/SVanalyzer/Comp.pm
@@ -148,7 +148,9 @@ sub potential_match {
   and returns a hash of different distance metrics 
   measuring how different the two variants are.
 
-  Input:  None.
+  Input:  If the second and third arguments are the key
+          '-nocleanup' and a true value, will leave comparison
+          fasta files and alignment output in place.
   Output: Reference to a hash containing distance metrics.
 
 =cut
@@ -210,7 +212,7 @@ sub calc_distance {
     }
     else {
         # align the alternative haplotypes to each other and evaluate
-        my ($maxshift, $editdistance) = $self->compare_alt_haplotypes($rs_alt_hap1, $rs_alt_hap2, $id1, $id2);
+        my ($maxshift, $editdistance) = $self->compare_alt_haplotypes($rs_alt_hap1, $rs_alt_hap2, $id1, $id2, '-nocleanup',  $params{'-nocleanup'});
         #if (($editdistance/$minhaplength < 0.05) && (abs($maxshift) < $minsvsize)) {
         if (abs($maxshift) < $minsvsize) {
             my $matchtype = 'NWMATCH';
@@ -309,7 +311,11 @@ sub compare_alt_haplotypes {
     my $id2 = shift || 'seq2';
     my %params = @_;
 
-    my $workingdir = tempdir( CLEANUP => 1);
+    if (!$params{'-nocleanup'}) {
+        $params{'-nocleanup'} = 0;
+    }
+
+    my $workingdir = tempdir( CLEANUP => !($params{'-nocleanup'}), DIR => '.');
     my $tmpfasta1 = "$workingdir/$id1.$id2.fa";
     $self->write_fasta_file($tmpfasta1, "$id1\_alt", $rs_alt1);
     my $tmpfasta2 = "$workingdir/$id2.$id1.fa";

diff --git a/scripts/SVmerge b/scripts/SVmerge
@@ -99,8 +99,8 @@ WARN("Done");
 
 sub process_commandline {
     # Set defaults here
-    %Opt = ( maxdist => 2000, prefix => 'merged', relshift => 0.2, relsizediff => 0.2, reldist => 0.2, minsize => 0 );
-    GetOptions(\%Opt, qw( ref=s variants=s fof=s distance_file=s maxdist=i relshift=f relsizediff=f reldist=f minsize=i skip_clusters seqspecific nocleanup prefix=s manual help+ version verbose debug)) || pod2usage(0);
+    %Opt = ( maxdist => 2000, prefix => 'merged', relshift => 0.2, relsizediff => 0.2, reldist => 0.2, minsize => 0, delimiter => ':' );
+    GetOptions(\%Opt, qw( ref=s variants=s fof=s distance_file=s maxdist=i relshift=f relsizediff=f reldist=f minsize=i skip_clusters seqspecific nocleanup prefix=s delimiter=s manual help+ version verbose debug)) || pod2usage(0);
     if ($Opt{manual})  { pod2usage(verbose => 2); }
     if ($Opt{help})    { pod2usage(verbose => $Opt{help}-1); }
     if ($Opt{version}) { die "SVmerge, ", q$Revision:$, "\n"; }
@@ -224,11 +224,19 @@ sub calc_distances {
     my $sortedvcf_fh = open_files_and_sort($vcf_file, $fof_file);
     my %distances = (); # hash of distance arrays for pairs of ids
     my %node_index = (); # indices of nodes
+    my %unique_ids = (); # to check for uniqueness of IDs across files
+
     while (my $rh_sv = retrieve_next_sorted_sv($sortedvcf_fh)) {
 
         next if (!($rh_sv->{chrom}));
 
         my $id2 = $rh_sv->{id};
+        if ($unique_ids{$id2}) {
+            die "Structural variant id $id2 seen more than once in merged set--please specify a unique id for each variant\n";
+        }
+        else {
+            $unique_ids{$id2} = 1;
+        }
         my $index2 = $node_index{$id2};
         if (!(defined($index2))) {
             push @nodes, $id2;
@@ -379,7 +387,7 @@ sub find_clusters {
         next if ($visited[$i]);
         my @connectednodes = dfs($i, $ra_node_edges, \@visited);
         my @nodenames = map { $nodes[$_] } @connectednodes;
-        my $nodestring = join ':', @nodenames;
+        my $nodestring = join $Opt{delimiter}, @nodenames;
         my ($clustercall, $no_cluster_calls, $dummynodestring, $no_exact_matches, $exactsubcluster, $maxdist1, $maxdist2, $maxdist3) =
                      analyze_cluster($nodestring, $rh_distances);
         DEBUG("Chose $clustercall as cluster rep");
@@ -469,7 +477,7 @@ sub analyze_cluster {
     my $nodestring = shift;
     my $rh_dists = shift;
 
-    my @nodes = split /:/, $nodestring;
+    my @nodes = split /$Opt{delimiter}/, $nodestring;
     my $no_cluster_calls = @nodes;
 
     if ($no_cluster_calls == 1) {
@@ -484,10 +492,10 @@ sub analyze_cluster {
         for (my $j=$i + 1; $j<=$#nodes; $j++) {
             my $idlesser = (($nodes[$i] cmp $nodes[$j]) > 0) ? $nodes[$j] : $nodes[$i];
             my $idgreater = (($nodes[$i] cmp $nodes[$j]) > 0) ? $nodes[$i] : $nodes[$j];
-            if (!$rh_dists->{"$idlesser:$idgreater"}) {
+            if (!$rh_dists->{"$idlesser$Opt{delimiter}$idgreater"}) {
                 next;
             }
-            my ($thisposdist, $this_dist1, $this_dist2, $this_dist3) = @{$rh_dists->{"$idlesser:$idgreater"}};
+            my ($thisposdist, $this_dist1, $this_dist2, $this_dist3) = @{$rh_dists->{"$idlesser$Opt{delimiter}$idgreater"}};
             $maxdist1 = ($this_dist1 > $maxdist1) ? $this_dist1 : $maxdist1;
             $maxdist2 = ($this_dist2 > $maxdist2) ? $this_dist2 : $maxdist2;
             $maxdist3 = ($this_dist3 > $maxdist3) ? $this_dist3 : $maxdist3;
@@ -498,7 +506,7 @@ sub analyze_cluster {
                     }
                     my $old_cluster_index = $node_cluster{$idgreater};
                     my $new_cluster_index = $node_cluster{$idlesser};
-                    $exact_clusters[$new_cluster_index] .= ":".$exact_clusters[$old_cluster_index];
+                    $exact_clusters[$new_cluster_index] .= "$Opt{delimiter}".$exact_clusters[$old_cluster_index];
                     $exact_clusters[$old_cluster_index] = '';
                     foreach my $key (keys %node_cluster) {
                         if ($node_cluster{$key} == $old_cluster_index) {
@@ -519,7 +527,7 @@ sub analyze_cluster {
     my $max_exact_clustersize = 0;
 
     foreach my $unique_cluster (@unique_clusters) {
-        my @unique_vars = split /:/, $unique_cluster;
+        my @unique_vars = split /$Opt{delimiter}/, $unique_cluster;
         my $no_vars = @unique_vars;
         if ($no_vars == $max_exact_clustersize) { # it's a tie
             push @largest_exact_clusters, $unique_cluster;
@@ -532,7 +540,7 @@ sub analyze_cluster {
     }
 
     my $exactsubcluster = $largest_exact_clusters[rand()*$#largest_exact_clusters];
-    my @exact_vars = split /:/, $exactsubcluster;
+    my @exact_vars = split /$Opt{delimiter}/, $exactsubcluster;
     my $clustercall = $exact_vars[rand()*$#exact_vars];
 
     return ($clustercall, $no_cluster_calls, $nodestring, $max_exact_clustersize, $exactsubcluster, $maxdist1, $maxdist2, $maxdist3);
@@ -545,10 +553,10 @@ sub write_vcf_line_with_info {
     my $ra_allowed_info_fields = shift;
 
     my $clustersvcalls = $rh_cluster_info->{'nodestring'};
-    my @clustercalls = split /:/, $clustersvcalls;
+    my @clustercalls = split /$Opt{delimiter}/, $clustersvcalls;
     my $no_clustersvcalls = @clustercalls;
     my $exactsvcalls = $rh_cluster_info->{'exactstring'};
-    my @exactcalls = split /:/, $exactsvcalls;
+    my @exactcalls = split /$Opt{delimiter}/, $exactsvcalls;
     my $no_exactsvcalls = @exactcalls;
     my $ra_maxdists = $rh_cluster_info->{'maxdists'};
 
@@ -631,6 +639,10 @@ Specify the path to a file of files with paths to VCF files of variants to merge
 
 Specify the maximum distance in bases between the positions of SVs that can be merged.
 
+=item B<--variantdelimiter>
+
+Specify the ASCII character to be used as a delimiter when forming clusters. This must be a character that is not contained in any of the cluster IDs in order for SVmerge to give reliable results.
+
 =back
 
 =head1 AUTHOR

diff --git a/scripts/SVrefine b/scripts/SVrefine
@@ -207,13 +207,19 @@ sub discover_regions {
         foreach my $entry_pair (@{$ra_rentry_pairs}) {
             my $current_end;
             my @aligns = @{$entry_pair->{aligns}};
-            foreach my $rh_align (sort byleftthenright @aligns) {
+            foreach my $rh_align (sort byleftthenright @aligns) { # left to right along reference
                 if ($current_end) {
                     if ($rh_align->{ref_start} < $current_end) { # overlap
-                        push @{$regions_hash{$chrom}}, [$rh_align->{ref_start}, $current_end];
+                        if (($Opt{maxsize}) && $current_end - $rh_align->{ref_start} < $Opt{maxsize}) {
+                            push @{$regions_hash{$chrom}}, [$rh_align->{ref_start}, $current_end];
+                        }
+                        DEBUG("OVERLAP $chrom:$rh_align->{ref_start}-$current_end\n");
                     }
-                    else {
-                        push @{$regions_hash{$chrom}}, [$current_end, $rh_align->{ref_start}];
+                    else { # gap
+                        if (($Opt{maxsize}) && $rh_align->{ref_start} - $current_end < $Opt{maxsize}) {
+                            push @{$regions_hash{$chrom}}, [$current_end, $rh_align->{ref_start}];
+                        }
+                        DEBUG("GAP $chrom:$current_end-$rh_align->{ref_start}\n");
                     }
                     $no_regions++;
                 }