Skip to content

Commit

Permalink
stats: Indel counterparts of NRD, GCsAF, and GCsS stats
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Jan 14, 2014
1 parent d8cc697 commit 20d1e6e
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 40 deletions.
35 changes: 27 additions & 8 deletions plot-vcfstats
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,30 @@ sub parse_params
exp=>"# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites"
},
{
id=>'NRD',
header=>'Non-Reference Discordance (NRD)',
exp=>"# NRD\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance"
id=>'GCiAF',
header=>'GCiAF, Genotype concordance by non-reference allele frequency (indels)',
exp=>"# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites"
},
{
id=>'NRDs',
header=>'Non-Reference Discordance (NRD), SNPs',
exp=>"# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance"
},
{
id=>'NRDi',
header=>'Non-Reference Discordance (NRD), indels',
exp=>"# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance"
},
{
id=>'GCsS',
header=>'GCcS, Genotype concordance by sample (SNPs)',
exp=>"# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches"
},
{
id=>'GCiS',
header=>'GCiS, Genotype concordance by sample (indels)',
exp=>"# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches"
},
{
id=>'PSC',
header=>'PSC, Per-sample counts',
Expand Down Expand Up @@ -534,17 +549,21 @@ sub parse_vcfstats1

if ( $a eq 'ID' ) { merge_id($opts,$$opts{dat}{$a},$dat{$a},$b); }
elsif ( ref($dat{$a}{$b}) ne 'ARRAY' ) {$$opts{dat}{$a}{$b} += $dat{$a}{$b} unless $b eq 'number of samples:'; } # SN, Summary numbers, do not sum sample counts
elsif ( $a eq 'NRD' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'NRDs' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'NRDi' ) { add_to_avg($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'DP' ) { merge_dp($$opts{dat}{$a}{$b},$dat{$a}{$b}); }
elsif ( $a eq 'GCsS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'GCiS' ) { merge_GCsS($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'GCsAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'GCiAF' ) { merge_GCsAF($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'ST' ) { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_str); }
elsif ( $a eq 'PSC') { merge_PSC($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'IDD') { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num); }
elsif ( $a eq 'FS') { merge_FS($$opts{dat}{$a}{$b},$dat{$a}{$b}); }
elsif ( $a eq 'ICS') { merge_ICS($$opts{dat}{$a}{$b},$dat{$a}{$b}); }
elsif ( $a eq 'ICL') { merge_ICL($$opts{dat}{$a}{$b},$dat{$a}{$b}); }
elsif ( $a eq 'TSTV') { merge_TSTV($$opts{dat}{$a}{$b},$dat{$a}{$b},$i); }
elsif ( $a eq 'DBG' ) { next; }
else { add_to_values($$opts{dat}{$a}{$b},$dat{$a}{$b},\&cmp_num_op); } # SiS AF IDD
}
}
Expand Down Expand Up @@ -1823,7 +1842,7 @@ sub create_pdf
}
if ( scalar get_values($opts,2,'GCsAF') )
{
my @vals = get_values($opts,2,'NRD');
my @vals = get_values($opts,2,'NRDs');
my $nrd = sprintf "%.2f", $vals[0][0];
my $rr = sprintf "%.2f", $vals[0][1];
my $ra = sprintf "%.2f", $vals[0][2];
Expand All @@ -1836,7 +1855,7 @@ sub create_pdf
\\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/REF} &
\\multicolumn{1}{>{\\columncolor{hcol1}}c|}{REF/ALT} &
\\multicolumn{1}{>{\\columncolor{hcol1}}c|}{ALT/ALT} &
\\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRD} \\\\ \\hline
\\multicolumn{1}{>{\\columncolor{hcol1}}c}{NRDs} \\\\ \\hline
$rr\\% & $ra\\% & $aa\\% & $nrd\\% \\\\
\\end{tabular}}];
tprint $tex, qq[
Expand Down Expand Up @@ -1907,7 +1926,7 @@ sub merge_vcfstats
my $sid = $$sec{id};
if ( !exists($$opts{dat}{$sid}) ) { next; }

print $fh "# $$sec{header}\n$$sec{exp}";
print $fh "# $$sec{header}\n$$sec{exp}\n";
for my $id (sort keys %{$$opts{dat}{$sid}})
{
for my $rec (@{$$opts{dat}{$sid}{$id}})
Expand All @@ -1918,7 +1937,7 @@ sub merge_vcfstats

if ( $sid eq 'ID' )
{
print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}";
print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}\n";
# output summary numbers here
for my $id (keys %{$$opts{dat}})
{
Expand Down
93 changes: 61 additions & 32 deletions vcfstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
int gt = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
if ( gt == GT_UNKN ) continue;
int gt2 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
if ( gt2 == GT_UNKN ) continue;
if ( gt != gt2 )
{
bcf_sr_t *reader = &files->readers[0];
Expand Down Expand Up @@ -935,44 +936,72 @@ static void print_stats(args_t *args)
{
printf("SN\t%d\tNumber of samples:\t%d\n", 2, args->files->n_smpl);

printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
gtcmp_t *stats = args->af_gts_snps;
int nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
for (i=0; i<args->m_af; i++)
int x;
for (x=0; x<2; x++)
{
int j, n = 0;
for (j=0; j<3; j++)
gtcmp_t *stats;
if ( x==0 )
{
n += stats[i].m[j] + stats[i].mm[j];
nrd_m[j] += stats[i].m[j];
nrd_mm[j] += stats[i].mm[j];
printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
stats = args->af_gts_snps;
}
if ( !i || !n ) continue; // skip singleton stats and empty bins
printf("GCsAF\t2\t%f", 100.*(i-1)/(args->m_af-1));
printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]);
printf("\t%d\t%d\t%d", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]);
printf("\t%f\t%d\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
else
{
printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
stats = args->af_gts_indels;
}
int nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
for (i=0; i<args->m_af; i++)
{
int j, n = 0;
for (j=0; j<3; j++)
{
n += stats[i].m[j] + stats[i].mm[j];
nrd_m[j] += stats[i].m[j];
nrd_mm[j] += stats[i].mm[j];
}
if ( !i || !n ) continue; // skip singleton stats and empty bins
printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]);
printf("\t%d\t%d\t%d", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]);
printf("\t%f\t%d\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
}

if ( x==0 )
printf("# Non-Reference Discordance (NRD), SNPs\n# NRDs\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
else
printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
int m = nrd_m[GT_HET_RA] + nrd_m[GT_HOM_AA];
int mm = nrd_mm[GT_HOM_RR] + nrd_mm[GT_HET_RA] + nrd_mm[GT_HOM_AA];
printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR] ? nrd_mm[GT_HOM_RR]*100.0/(nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR]) : 0,
nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA] ? nrd_mm[GT_HET_RA]*100.0/(nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA]) : 0,
nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA] ? nrd_mm[GT_HOM_AA]*100.0/(nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA]) : 0
);
}

int m = nrd_m[GT_HET_RA] + nrd_m[GT_HOM_AA];
int mm = nrd_mm[GT_HOM_RR] + nrd_mm[GT_HET_RA] + nrd_mm[GT_HOM_AA];
printf("# Non-Reference Discordance (NRD)\n# NRD\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
printf("NRD\t2\t%f\t%f\t%f\t%f\n",
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR] ? nrd_mm[GT_HOM_RR]*100.0/(nrd_m[GT_HOM_RR]+nrd_mm[GT_HOM_RR]) : 0,
nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA] ? nrd_mm[GT_HET_RA]*100.0/(nrd_m[GT_HET_RA]+nrd_mm[GT_HET_RA]) : 0,
nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA] ? nrd_mm[GT_HOM_AA]*100.0/(nrd_m[GT_HOM_AA]+nrd_mm[GT_HOM_AA]) : 0
);

printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n");
stats = args->smpl_gts_snps;
for (i=0; i<args->files->n_smpl; i++)
for (x=0; x<2; x++)
{
int m = stats[i].m[GT_HET_RA] + stats[i].m[GT_HOM_AA];
int mm = stats[i].mm[GT_HOM_RR] + stats[i].mm[GT_HET_RA] + stats[i].mm[GT_HOM_AA];
printf("GCsS\t2\t%s\t%.3f", args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]);
printf("\t%d\t%d\t%d\n", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]);
gtcmp_t *stats;
if ( x==0 )
{
printf("# GCcS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n");
stats = args->smpl_gts_snps;
}
else
{
printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\n");
stats = args->smpl_gts_indels;
}
for (i=0; i<args->files->n_smpl; i++)
{
int m = stats[i].m[GT_HET_RA] + stats[i].m[GT_HOM_AA];
int mm = stats[i].mm[GT_HOM_RR] + stats[i].mm[GT_HET_RA] + stats[i].mm[GT_HOM_AA];
printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
printf("\t%d\t%d\t%d", stats[i].m[GT_HOM_RR],stats[i].m[GT_HET_RA],stats[i].m[GT_HOM_AA]);
printf("\t%d\t%d\t%d\n", stats[i].mm[GT_HOM_RR],stats[i].mm[GT_HET_RA],stats[i].mm[GT_HOM_AA]);
}
}
}

Expand Down

0 comments on commit 20d1e6e

Please sign in to comment.