diff --git a/plot-vcfstats b/plot-vcfstats index d26d1eb3c..c5f44da52 100755 --- a/plot-vcfstats +++ b/plot-vcfstats @@ -136,15 +136,30 @@ sub parse_params exp=>"# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites" }, { - id=>'NRD', - header=>'Non-Reference Discordance (NRD)', - exp=>"# NRD\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" + id=>'GCiAF', + header=>'GCiAF, Genotype concordance by non-reference allele frequency (indels)', + exp=>"# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites" + }, + { + id=>'NRDs', + header=>'Non-Reference Discordance (NRD), SNPs', + exp=>"# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" + }, + { + id=>'NRDi', + header=>'Non-Reference Discordance (NRD), indels', + exp=>"# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance" }, { id=>'GCsS', header=>'GCcS, Genotype concordance by sample (SNPs)', exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches" }, + { + id=>'GCiS', + header=>'GCiS, Genotype concordance by sample (indels)', + exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches" + }, { id=>'PSC', header=>'PSC, Per-sample counts', @@ -534,10 +549,13 @@ sub parse_vcfstats1 if ( $a eq 'ID' ) { merge_id($opts,$$opts{dat}{$a},$dat{$a},$b); } elsif ( ref($dat{$a}{$b}) ne 'ARRAY' ) {$$opts{dat}{$a}{$b} += $dat{$a}{$b} unless $b eq 'number of samples:'; } # SN, Summary numbers, do not sum sample counts - elsif ( $a eq 'NRD' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } + elsif ( $a eq 'NRDs' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } + elsif ( $a eq 'NRDi' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'DP' ) { merge_dp($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'GCsS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } + elsif ( $a eq 'GCiS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'GCsAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } + elsif ( $a eq 'GCiAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'ST' ) { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_str); } elsif ( $a eq 'PSC') { merge_PSC($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } elsif ( $a eq 'IDD') { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num); } @@ -545,6 +563,7 @@ sub parse_vcfstats1 elsif ( $a eq 'ICS') { merge_ICS($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'ICL') { merge_ICL($$opts{dat}{$a}{$b},$dat{$a}{$b}); } elsif ( $a eq 'TSTV') { merge_TSTV($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); } + elsif ( $a eq 'DBG' ) { next; } else { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num_op); } # SiS AF IDD } } @@ -1823,7 +1842,7 @@ sub create_pdf } if ( scalar get_values($opts,2,'GCsAF') ) { - my @vals = get_values($opts,2,'NRD'); + my @vals = get_values($opts,2,'NRDs'); my $nrd = sprintf "%.2f", $vals[0][0]; my $rr = sprintf "%.2f", $vals[0][1]; my $ra = sprintf "%.2f", $vals[0][2]; @@ -1836,7 +1855,7 @@ sub create_pdf \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/REF} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/ALT} & \\multicolumn{1}{>{\\columncolor{hcol1}}c|}{ALT/ALT} & - \\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRD} \\\\ \\hline + \\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRDs} \\\\ \\hline $rr\\% & $ra\\% & $aa\\% & $nrd\\% \\\\ \\end{tabular}}]; tprint $tex, qq[ @@ -1907,7 +1926,7 @@ sub merge_vcfstats my $sid = $$sec{id}; if ( !exists($$opts{dat}{$sid}) ) { next; } - print $fh "# $$sec{header}\n$$sec{exp}"; + print $fh "# $$sec{header}\n$$sec{exp}\n"; for my $id (sort keys %{$$opts{dat}{$sid}}) { for my $rec (@{$$opts{dat}{$sid}{$id}}) @@ -1918,7 +1937,7 @@ sub merge_vcfstats if ( $sid eq 'ID' ) { - print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}"; + print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}\n"; # output summary numbers here for my $id (keys %{$$opts{dat}}) { diff --git a/vcfstats.c b/vcfstats.c index 6f3bb0a06..ec7844f64 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -731,6 +731,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int int gt = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL); if ( gt == GT_UNKN ) continue; int gt2 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL); + if ( gt2 == GT_UNKN ) continue; if ( gt != gt2 ) { bcf_sr_t *reader = &files->readers[0]; @@ -935,44 +936,72 @@ static void print_stats(args_t *args) { printf("SN\t%d\tNumber of samples:\t%d\n", 2, args->files->n_smpl); - printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); - gtcmp_t *stats = args->af_gts_snps; - int nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0}; - for (i=0; im_af; i++) + int x; + for (x=0; x<2; x++) { - int j, n = 0; - for (j=0; j<3; j++) + gtcmp_t *stats; + if ( x==0 ) { - n += stats[i].m[j] + stats[i].mm[j]; - nrd_m[j] += stats[i].m[j]; - nrd_mm[j] += stats[i].mm[j]; + printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + stats = args->af_gts_snps; } - if ( !i || !n ) continue; // skip singleton stats and empty bins - printf("GCsAF\t2\t%f", 100.*(i-1)/(args->m_af-1)); - printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]); - printf("\t%d\t%d\t%d", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]); - printf("\t%f\t%d\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); + else + { + printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n"); + stats = args->af_gts_indels; + } + int nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0}; + for (i=0; im_af; i++) + { + int j, n = 0; + for (j=0; j<3; j++) + { + n += stats[i].m[j] + stats[i].mm[j]; + nrd_m[j] += stats[i].m[j]; + nrd_mm[j] += stats[i].mm[j]; + } + if ( !i || !n ) continue; // skip singleton stats and empty bins + printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1)); + printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]); + printf("\t%d\t%d\t%d", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]); + printf("\t%f\t%d\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n); + } + + if ( x==0 ) + printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + else + printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); + int m = nrd_m[GT_HET_RA] + nrd_m[GT_HOM_AA]; + int mm = nrd_mm[GT_HOM_RR] + nrd_mm[GT_HET_RA] + nrd_mm[GT_HOM_AA]; + printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i', + m+mm ? mm*100.0/(m+mm) : 0, + nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR] ? nrd_mm[GT_HOM_RR]*100.0/(nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR]) : 0, + nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA] ? nrd_mm[GT_HET_RA]*100.0/(nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA]) : 0, + nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA] ? nrd_mm[GT_HOM_AA]*100.0/(nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA]) : 0 + ); } - int m = nrd_m[GT_HET_RA] + nrd_m[GT_HOM_AA]; - int mm = nrd_mm[GT_HOM_RR] + nrd_mm[GT_HET_RA] + nrd_mm[GT_HOM_AA]; - printf("# Non-Reference Discordance (NRD)\n# NRD\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n"); - printf("NRD\t2\t%f\t%f\t%f\t%f\n", - m+mm ? mm*100.0/(m+mm) : 0, - nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR] ? nrd_mm[GT_HOM_RR]*100.0/(nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR]) : 0, - nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA] ? nrd_mm[GT_HET_RA]*100.0/(nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA]) : 0, - nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA] ? nrd_mm[GT_HOM_AA]*100.0/(nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA]) : 0 - ); - - printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n"); - stats = args->smpl_gts_snps; - for (i=0; ifiles->n_smpl; i++) + for (x=0; x<2; x++) { - int m = stats[i].m[GT_HET_RA] + stats[i].m[GT_HOM_AA]; - int mm = stats[i].mm[GT_HOM_RR] + stats[i].mm[GT_HET_RA] + stats[i].mm[GT_HOM_AA]; - printf("GCsS\t2\t%s\t%.3f", args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); - printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]); - printf("\t%d\t%d\t%d\n", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]); + gtcmp_t *stats; + if ( x==0 ) + { + printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n"); + stats = args->smpl_gts_snps; + } + else + { + printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n"); + stats = args->smpl_gts_indels; + } + for (i=0; ifiles->n_smpl; i++) + { + int m = stats[i].m[GT_HET_RA] + stats[i].m[GT_HOM_AA]; + int mm = stats[i].mm[GT_HOM_RR] + stats[i].mm[GT_HET_RA] + stats[i].mm[GT_HOM_AA]; + printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0); + printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]); + printf("\t%d\t%d\t%d\n", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]); + } } }