Skip to content

Commit

Permalink
plot-vcfstats: fix bug in merging multiple stats files
Browse files Browse the repository at this point in the history
d4c7d5c introduced new columns
to the DP section of the stats files for the site depth distribiution.
However when run without the -s option this the genotype ditribution
will be now exist, but be filled with zeros causing the bug in samtools#311.

Fixes samtools#311 and adds a test for this. Todo check other merges are
working ok after any column additions.
  • Loading branch information
mcshane committed Sep 2, 2015
1 parent 8e82f7d commit cabeeb2
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 10 deletions.
22 changes: 18 additions & 4 deletions plot-vcfstats
Original file line number Diff line number Diff line change
Expand Up @@ -474,10 +474,24 @@ sub merge_dp
{
my ($a,$b) = @_;
add_to_values($a,$b,\&cmp_num_op);
# recalculate fraction of GTs, cannot be simply summed
my $sum = 0;
for (my $i=0; $i<@$a; $i++) { $sum += $$a[$i][1]; }
for (my $i=0; $i<@$a; $i++) { $$a[$i][2] = $$a[$i][1]*100./$sum; }
# recalculate fraction of GTs and fraction of sites, cannot be simply summed
my $gsum = 0; # genotype sum
my $ssum = 0; # site sum
for (my $i=0; $i<@$a; $i++)
{
$gsum += $$a[$i][1];
if (@{$$a[$i]}>3) {
$ssum += $$a[$i][3];
}
else{
push @{$$a[$i]}, (0,0); # older stats files will not have last 2 columns for (number of sites, fraction of sites), so fill in as zero
}
}
for (my $i=0; $i<@$a; $i++)
{
$$a[$i][2] = $gsum ? $$a[$i][1]*100./$gsum : 0;
$$a[$i][4] = $ssum ? $$a[$i][3]*100./$ssum : 0;
}
}

sub merge_GCsS
Expand Down
2 changes: 2 additions & 0 deletions test/check.chk
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ DP 0 30 2 5.555556 0 0.000000
DP 0 31 16 44.444444 0 0.000000
DP 0 32 4 11.111111 0 0.000000
DP 0 35 4 11.111111 0 0.000000
DP 0 60 0 0.000000 1 33.333333
DP 0 62 0 0.000000 2 66.666667
# PSC, Per-sample counts
# PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons
PSC 0 A 0 2 4 3 2 9 28.7 1
Expand Down
7 changes: 4 additions & 3 deletions test/check.vcf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
##fileformat=VCFv4.1
##INFO=<ID=TEST,Number=1,Type=Integer,Description="Testing Tag">
##FORMAT=<ID=TT,Number=A,Type=Integer,Description="Testing Tag, with commas and \"escapes\" and escaped escapes combined with \\\"quotes\\\\\"">
##INFO=<ID=DP,Number=1,Type=Integer,Description="read depth">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Expand All @@ -27,9 +28,9 @@
1 3184885 . TAAAA TA,T 61.5 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:12:10 1/2:12:10
2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:322:26 1/2:322:26
3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:91:26 1/2:91:26
4 3258448 . TACACACAC T 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258449 . GCAAA GA,G 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258450 . AAAAGAAAAAG A,AAAAAAG 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258448 . TACACACAC T 59.9 PASS DP=62;AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258449 . GCAAA GA,G 59.9 PASS DP=62;AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258450 . AAAAGAAAAAG A,AAAAAAG 59.9 PASS DP=60;AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258451 . AAA AGT 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258452 . AAA AGA 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
4 3258453 . AACA AGA 59.9 PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31
Expand Down
65 changes: 65 additions & 0 deletions test/check_merge.chk
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Definition of sets
# ID [2]id [3]tab-separated file names
# SN, Summary numbers
# SN [2]id [3]key [4]value
SN 0 number of indels: 9
SN 0 number of records: 18
SN 0 number of multiallelic SNP sites: 1
SN 0 number of SNPs: 5
SN 0 number of others: 2
SN 0 number of MNPs: 1
SN 0 number of multiallelic sites: 6
SN 0 number of samples: 2
SN 0 number of no-ALTs: 1
# # TSTV, transition/transversions:
# TSTV [2]id [3]ts [4]tv [5]ts/tv [6]ts (1st ALT) [7]tv (1st ALT) [8]ts/tv (1st ALT)
TSTV 0 3 2 1.50 3 1 3.00
# Sis, Singleton stats
# SiS [2]id [3]allele count [4]number of SNPs [5]number of transitions [6]number of transversions [7]number of indels [8]repeat-consistent [9]repeat-inconsistent [10]not applicable
SiS 0 1 3 1 2 0 0 0 0
# AF, Stats by non-reference allele frequency
# AF [2]id [3]allele frequency [4]number of SNPs [5]number of transitions [6]number of transversions [7]number of indels [8]repeat-consistent [9]repeat-inconsistent [10]not applicable
AF 0 0.000000 3 1 2 2 0 0 2
AF 0 49.000000 0 0 0 12 0 0 12
AF 0 74.000000 1 1 0 0 0 0 0
AF 0 99.000000 1 1 0 0 0 0 0
# IDD, InDel distribution
# IDD [2]id [3]length (deletions negative) [4]count
IDD 0 -10 1
IDD 0 -8 1
IDD 0 -4 3
IDD 0 -3 4
IDD 0 -2 1
IDD 0 -1 2
IDD 0 1 1
IDD 0 2 1
# ST, Substitution types
# ST [2]id [3]type [4]count
ST 0 A>C 0
ST 0 A>G 0
ST 0 A>T 0
ST 0 C>A 0
ST 0 C>G 0
ST 0 C>T 0
ST 0 G>A 3
ST 0 G>C 1
ST 0 G>T 1
ST 0 T>A 0
ST 0 T>C 0
ST 0 T>G 0
# DP, Depth distribution
# DP [2]id [3]bin [4]number of genotypes [5]fraction of genotypes (%) [6]number of sites [7]fraction of sites (%)
DP 0 60 0 0.000000 1 33.333333
DP 0 62 0 0.000000 2 66.666667
# QUAL, Stats by quality
# QUAL [2]id [3]Quality [4]number of SNPs [5]number of transitions (1st ALT) [6]number of transversions (1st ALT) [7]number of indels
QUAL 0 12 1 0 1 1
QUAL 0 45 0 0 0 1
QUAL 0 59 2 1 0 3
QUAL 0 60 1 1 0 0
QUAL 0 61 0 0 0 1
QUAL 0 79 0 0 0 1
QUAL 0 82 0 0 0 1
QUAL 0 90 1 1 0 0
QUAL 0 342 0 0 0 1
16 changes: 13 additions & 3 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
test_vcf_idxstats($opts,in=>'empty',args=>'-s',out=>'empty.idx.out');
test_vcf_idxstats($opts,in=>'empty',args=>'-n',out=>'empty.idx_count.out');
test_vcf_check($opts,in=>'check',out=>'check.chk');
test_vcf_check_merge($opts,in=>'check',out=>'check_merge.chk');
test_vcf_stats($opts,in=>['stats.a','stats.b'],out=>'stats.chk',args=>'-s -');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
Expand Down Expand Up @@ -128,8 +129,6 @@
test_vcf_view($opts,in=>'view.minmaxac',out=>'view.minmaxac.1.out',args=>q[-H -q0.3:major],reg=>'');
test_vcf_call($opts,in=>'mpileup',out=>'mpileup.1.out',args=>'-mv');
test_vcf_call($opts,in=>'mpileup',out=>'mpileup.2.out',args=>'-mvg0');
test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.samples');
test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped');
test_vcf_call_cAls($opts,in=>'mpileup',out=>'mpileup.cAls.out',tab=>'mpileup');
test_vcf_filter($opts,in=>'filter.1',out=>'filter.1.out',args=>'-mx -g2 -G2');
test_vcf_filter($opts,in=>'filter.2',out=>'filter.2.out',args=>q[-e'QUAL==59.2 || (INDEL=0 & (FMT/GQ=25 | FMT/DP=10))' -sModified -S.]);
Expand Down Expand Up @@ -423,6 +422,18 @@ sub test_vcf_check
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools stats -s - $$opts{tmp}/$args{in}.vcf.gz | grep -v '^# The command' | grep -v '^# This' | grep -v '^ID\t'");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools stats -s - | grep -v '^# The command' | grep -v '^# This' | grep -v '^ID\t'");
}

sub test_vcf_check_merge
{
my ($opts,%args) = @_;
bgzip_tabix_vcf($opts,$args{in});
cmd("$$opts{bin}/bcftools stats -r 1 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.1.chk");
cmd("$$opts{bin}/bcftools stats -r 2 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.2.chk");
cmd("$$opts{bin}/bcftools stats -r 3 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.3.chk");
cmd("$$opts{bin}/bcftools stats -r 4 $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.4.chk");
test_cmd($opts,%args,cmd=>"$$opts{bin}/plot-vcfstats -m $$opts{tmp}/$args{in}.1.chk $$opts{tmp}/$args{in}.2.chk $$opts{tmp}/$args{in}.3.chk $$opts{tmp}/$args{in}.4.chk | grep -v 'plot-vcfstats' | grep -v '^# The command' | grep -v '^# This' | grep -v '^ID\t'");
}

sub test_vcf_stats
{
my ($opts,%args) = @_;
Expand Down Expand Up @@ -530,7 +541,6 @@ sub test_vcf_view
sub test_vcf_call
{
my ($opts,%args) = @_;
$args{args} =~ s/{PATH}/$$opts{path}/g;
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call $args{args} $$opts{path}/$args{in}.vcf | grep -v ^##bcftools_");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
Expand Down

0 comments on commit cabeeb2

Please sign in to comment.