Skip to content

Commit

Permalink
Bug fix for SVrefine recovering lost variants
Browse files Browse the repository at this point in the history
  • Loading branch information
nhansen committed Dec 26, 2019
1 parent ddd5fa7 commit b29e09a
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 18 deletions.
12 changes: 9 additions & 3 deletions lib/NHGRI/SVanalyzer/Comp.pm
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ sub potential_match {
and returns a hash of different distance metrics
measuring how different the two variants are.
Input: None.
Input: If the second and third arguments are the key
'-nocleanup' and a true value, will leave comparison
fasta files and alignment output in place.
Output: Reference to a hash containing distance metrics.
=cut
Expand Down Expand Up @@ -210,7 +212,7 @@ sub calc_distance {
}
else {
# align the alternative haplotypes to each other and evaluate
my ($maxshift, $editdistance) = $self->compare_alt_haplotypes($rs_alt_hap1, $rs_alt_hap2, $id1, $id2);
my ($maxshift, $editdistance) = $self->compare_alt_haplotypes($rs_alt_hap1, $rs_alt_hap2, $id1, $id2, '-nocleanup', $params{'-nocleanup'});
#if (($editdistance/$minhaplength < 0.05) && (abs($maxshift) < $minsvsize)) {
if (abs($maxshift) < $minsvsize) {
my $matchtype = 'NWMATCH';
Expand Down Expand Up @@ -309,7 +311,11 @@ sub compare_alt_haplotypes {
my $id2 = shift || 'seq2';
my %params = @_;

my $workingdir = tempdir( CLEANUP => 1);
if (!$params{'-nocleanup'}) {
$params{'-nocleanup'} = 0;
}

my $workingdir = tempdir( CLEANUP => !($params{'-nocleanup'}), DIR => '.');
my $tmpfasta1 = "$workingdir/$id1.$id2.fa";
$self->write_fasta_file($tmpfasta1, "$id1\_alt", $rs_alt1);
my $tmpfasta2 = "$workingdir/$id2.$id1.fa";
Expand Down
34 changes: 23 additions & 11 deletions scripts/SVmerge
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ WARN("Done");

sub process_commandline {
# Set defaults here
%Opt = ( maxdist => 2000, prefix => 'merged', relshift => 0.2, relsizediff => 0.2, reldist => 0.2, minsize => 0 );
GetOptions(\%Opt, qw( ref=s variants=s fof=s distance_file=s maxdist=i relshift=f relsizediff=f reldist=f minsize=i skip_clusters seqspecific nocleanup prefix=s manual help+ version verbose debug)) || pod2usage(0);
%Opt = ( maxdist => 2000, prefix => 'merged', relshift => 0.2, relsizediff => 0.2, reldist => 0.2, minsize => 0, delimiter => ':' );
GetOptions(\%Opt, qw( ref=s variants=s fof=s distance_file=s maxdist=i relshift=f relsizediff=f reldist=f minsize=i skip_clusters seqspecific nocleanup prefix=s delimiter=s manual help+ version verbose debug)) || pod2usage(0);
if ($Opt{manual}) { pod2usage(verbose => 2); }
if ($Opt{help}) { pod2usage(verbose => $Opt{help}-1); }
if ($Opt{version}) { die "SVmerge, ", q$Revision:$, "\n"; }
Expand Down Expand Up @@ -224,11 +224,19 @@ sub calc_distances {
my $sortedvcf_fh = open_files_and_sort($vcf_file, $fof_file);
my %distances = (); # hash of distance arrays for pairs of ids
my %node_index = (); # indices of nodes
my %unique_ids = (); # to check for uniqueness of IDs across files

while (my $rh_sv = retrieve_next_sorted_sv($sortedvcf_fh)) {

next if (!($rh_sv->{chrom}));

my $id2 = $rh_sv->{id};
if ($unique_ids{$id2}) {
die "Structural variant id $id2 seen more than once in merged set--please specify a unique id for each variant\n";
}
else {
$unique_ids{$id2} = 1;
}
my $index2 = $node_index{$id2};
if (!(defined($index2))) {
push @nodes, $id2;
Expand Down Expand Up @@ -379,7 +387,7 @@ sub find_clusters {
next if ($visited[$i]);
my @connectednodes = dfs($i, $ra_node_edges, \@visited);
my @nodenames = map { $nodes[$_] } @connectednodes;
my $nodestring = join ':', @nodenames;
my $nodestring = join $Opt{delimiter}, @nodenames;
my ($clustercall, $no_cluster_calls, $dummynodestring, $no_exact_matches, $exactsubcluster, $maxdist1, $maxdist2, $maxdist3) =
analyze_cluster($nodestring, $rh_distances);
DEBUG("Chose $clustercall as cluster rep");
Expand Down Expand Up @@ -469,7 +477,7 @@ sub analyze_cluster {
my $nodestring = shift;
my $rh_dists = shift;

my @nodes = split /:/, $nodestring;
my @nodes = split /$Opt{delimiter}/, $nodestring;
my $no_cluster_calls = @nodes;

if ($no_cluster_calls == 1) {
Expand All @@ -484,10 +492,10 @@ sub analyze_cluster {
for (my $j=$i + 1; $j<=$#nodes; $j++) {
my $idlesser = (($nodes[$i] cmp $nodes[$j]) > 0) ? $nodes[$j] : $nodes[$i];
my $idgreater = (($nodes[$i] cmp $nodes[$j]) > 0) ? $nodes[$i] : $nodes[$j];
if (!$rh_dists->{"$idlesser:$idgreater"}) {
if (!$rh_dists->{"$idlesser$Opt{delimiter}$idgreater"}) {
next;
}
my ($thisposdist, $this_dist1, $this_dist2, $this_dist3) = @{$rh_dists->{"$idlesser:$idgreater"}};
my ($thisposdist, $this_dist1, $this_dist2, $this_dist3) = @{$rh_dists->{"$idlesser$Opt{delimiter}$idgreater"}};
$maxdist1 = ($this_dist1 > $maxdist1) ? $this_dist1 : $maxdist1;
$maxdist2 = ($this_dist2 > $maxdist2) ? $this_dist2 : $maxdist2;
$maxdist3 = ($this_dist3 > $maxdist3) ? $this_dist3 : $maxdist3;
Expand All @@ -498,7 +506,7 @@ sub analyze_cluster {
}
my $old_cluster_index = $node_cluster{$idgreater};
my $new_cluster_index = $node_cluster{$idlesser};
$exact_clusters[$new_cluster_index] .= ":".$exact_clusters[$old_cluster_index];
$exact_clusters[$new_cluster_index] .= "$Opt{delimiter}".$exact_clusters[$old_cluster_index];
$exact_clusters[$old_cluster_index] = '';
foreach my $key (keys %node_cluster) {
if ($node_cluster{$key} == $old_cluster_index) {
Expand All @@ -519,7 +527,7 @@ sub analyze_cluster {
my $max_exact_clustersize = 0;

foreach my $unique_cluster (@unique_clusters) {
my @unique_vars = split /:/, $unique_cluster;
my @unique_vars = split /$Opt{delimiter}/, $unique_cluster;
my $no_vars = @unique_vars;
if ($no_vars == $max_exact_clustersize) { # it's a tie
push @largest_exact_clusters, $unique_cluster;
Expand All @@ -532,7 +540,7 @@ sub analyze_cluster {
}

my $exactsubcluster = $largest_exact_clusters[rand()*$#largest_exact_clusters];
my @exact_vars = split /:/, $exactsubcluster;
my @exact_vars = split /$Opt{delimiter}/, $exactsubcluster;
my $clustercall = $exact_vars[rand()*$#exact_vars];

return ($clustercall, $no_cluster_calls, $nodestring, $max_exact_clustersize, $exactsubcluster, $maxdist1, $maxdist2, $maxdist3);
Expand All @@ -545,10 +553,10 @@ sub write_vcf_line_with_info {
my $ra_allowed_info_fields = shift;

my $clustersvcalls = $rh_cluster_info->{'nodestring'};
my @clustercalls = split /:/, $clustersvcalls;
my @clustercalls = split /$Opt{delimiter}/, $clustersvcalls;
my $no_clustersvcalls = @clustercalls;
my $exactsvcalls = $rh_cluster_info->{'exactstring'};
my @exactcalls = split /:/, $exactsvcalls;
my @exactcalls = split /$Opt{delimiter}/, $exactsvcalls;
my $no_exactsvcalls = @exactcalls;
my $ra_maxdists = $rh_cluster_info->{'maxdists'};

Expand Down Expand Up @@ -631,6 +639,10 @@ Specify the path to a file of files with paths to VCF files of variants to merge
Specify the maximum distance in bases between the positions of SVs that can be merged.
=item B<--variantdelimiter>
Specify the ASCII character to be used as a delimiter when forming clusters. This must be a character that is not contained in any of the cluster IDs in order for SVmerge to give reliable results.
=back
=head1 AUTHOR
Expand Down
14 changes: 10 additions & 4 deletions scripts/SVrefine
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,19 @@ sub discover_regions {
foreach my $entry_pair (@{$ra_rentry_pairs}) {
my $current_end;
my @aligns = @{$entry_pair->{aligns}};
foreach my $rh_align (sort byleftthenright @aligns) {
foreach my $rh_align (sort byleftthenright @aligns) { # left to right along reference
if ($current_end) {
if ($rh_align->{ref_start} < $current_end) { # overlap
push @{$regions_hash{$chrom}}, [$rh_align->{ref_start}, $current_end];
if (($Opt{maxsize}) && $current_end - $rh_align->{ref_start} < $Opt{maxsize}) {
push @{$regions_hash{$chrom}}, [$rh_align->{ref_start}, $current_end];
}
DEBUG("OVERLAP $chrom:$rh_align->{ref_start}-$current_end\n");
}
else {
push @{$regions_hash{$chrom}}, [$current_end, $rh_align->{ref_start}];
else { # gap
if (($Opt{maxsize}) && $rh_align->{ref_start} - $current_end < $Opt{maxsize}) {
push @{$regions_hash{$chrom}}, [$current_end, $rh_align->{ref_start}];
}
DEBUG("GAP $chrom:$current_end-$rh_align->{ref_start}\n");
}
$no_regions++;
}
Expand Down

0 comments on commit b29e09a

Please sign in to comment.