Skip to content

Commit

Permalink
Merge pull request samtools#299 from samtools/feature/stats
Browse files Browse the repository at this point in the history
update stats to match upstream
  • Loading branch information
mcshane committed Jul 28, 2015
2 parents a04ddb7 + 255ee9a commit 5d52a36
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 28 deletions.
12 changes: 10 additions & 2 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1439,7 +1439,11 @@ columns.
*--debug*::
produce verbose per-site and per-sample output

*-e, --exons* 'file.gz'::
*-e, --exclude* 'EXPRESSION'::
exclude sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.

*-E, --exons* 'file.gz'::
tab-delimited file with exons for indel frameshifts statistics. The columns
of the file are CHR, FROM, TO, with 1-based, inclusive, positions. The file
is BGZF-compressed and indexed with tabix
Expand All @@ -1453,7 +1457,11 @@ columns.
*-F, --fasta-ref* 'ref.fa'::
faidx indexed reference sequence file to determine INDEL context

*-i, --split-by-ID*::
*-i, --include* 'EXPRESSION'::
include only sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.

*-I, --split-by-ID*::
collect stats separately for sites which have the ID column set ("known
sites") or which do not have the ID column set ("novel sites").

Expand Down
9 changes: 7 additions & 2 deletions test/check.chk
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# SN [2]id [3]key [4]value
SN 0 number of samples: 2
SN 0 number of records: 18
SN 0 number of no-ALTs: 1
SN 0 number of SNPs: 5
SN 0 number of MNPs: 1
SN 0 number of indels: 9
Expand Down Expand Up @@ -70,8 +71,12 @@ DP 0 32 4 11.111111 0 0.000000
DP 0 35 4 11.111111 0 0.000000
# PSC, Per-sample counts
# PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons
PSC 0 A 0 2 3 3 2 9 28.7 1
PSC 0 B 1 1 3 2 2 9 28.7 0
PSC 0 A 0 2 4 3 2 9 28.7 1
PSC 0 B 1 1 4 2 2 9 28.7 0
# PSI, Per-Sample Indels
# PSI [2]id [3]sample [4]in-frame [5]out-frame [6]not applicable [7]out/(in+out) ratio [8]nHets [9]nAA
PSI 0 A 0 0 0 0.00 9 0
PSI 0 B 0 0 0 0.00 9 0
# HWE
# HWE [2]id [3]1st ALT allele frequency [4]Number of observations [5]25th percentile [6]median [7]75th percentile
HWE 0 0.000000 2 0.490000 0.490000 0.990000
Expand Down
12 changes: 12 additions & 0 deletions test/stats.chk
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
SN 0 number of samples: 3
SN 1 number of samples: 3
SN 0 number of records: 0
SN 0 number of no-ALTs: 0
SN 0 number of SNPs: 0
SN 0 number of MNPs: 0
SN 0 number of indels: 0
SN 0 number of others: 0
SN 0 number of multiallelic sites: 0
SN 0 number of multiallelic SNP sites: 0
SN 1 number of records: 0
SN 1 number of no-ALTs: 0
SN 1 number of SNPs: 0
SN 1 number of MNPs: 0
SN 1 number of indels: 0
SN 1 number of others: 0
SN 1 number of multiallelic sites: 0
SN 1 number of multiallelic SNP sites: 0
SN 2 number of records: 3
SN 2 number of no-ALTs: 0
SN 2 number of SNPs: 3
SN 2 number of MNPs: 0
SN 2 number of indels: 0
Expand Down Expand Up @@ -84,4 +87,13 @@ PSC 1 C 0 0 0 0 0 0 0.0 0
PSC 2 A 3 0 0 0 0 0 0.0 0
PSC 2 B 0 0 3 3 0 0 0.0 0
PSC 2 C 0 3 0 3 0 0 0.0 0
PSI 0 A 0 0 0 0.00 0 0
PSI 0 B 0 0 0 0.00 0 0
PSI 0 C 0 0 0 0.00 0 0
PSI 1 A 0 0 0 0.00 0 0
PSI 1 B 0 0 0 0.00 0 0
PSI 1 C 0 0 0 0.00 0 0
PSI 2 A 0 0 0 0.00 0 0
PSI 2 B 0 0 0 0.00 0 0
PSI 2 C 0 0 0 0.00 0 0
HWE 2 49.000000 3 0.330000 0.330000 0.330000
100 changes: 76 additions & 24 deletions vcfstats.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
Copyright (C) 2012-2014 Genome Research Ltd.
Copyright (C) 2012-2015 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -38,6 +38,11 @@ THE SOFTWARE. */
#include <htslib/faidx.h>
#include <inttypes.h>
#include "bcftools.h"
#include "filter.h"

// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2

#define HWE_STATS 1
#define QUAL_STATS 1
Expand Down Expand Up @@ -75,7 +80,7 @@ smpl_r_t;

typedef struct
{
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records;
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons
#if HWE_STATS
int *af_hwe;
Expand All @@ -92,6 +97,7 @@ typedef struct
int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1;
int subst[15];
int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl;
int *smpl_indel_hets, *smpl_indel_homs;
int *smpl_frm_shifts; // not-applicable, in-frame, out-frame
unsigned long int *smpl_dp;
idist_t dp, dp_sites;
Expand Down Expand Up @@ -145,6 +151,11 @@ typedef struct
char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
int argc, verbose_sites, first_allele_only, samples_is_file;
int split_by_id, nstats;

filter_t *filter[2];
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE

// Per Sample r working data arrays of size equal to number of samples
smpl_r_t* smpl_r_snps;
smpl_r_t* smpl_r_indels;
Expand Down Expand Up @@ -387,6 +398,13 @@ static void init_stats(args_t *args)
args->nstats = args->files->nreaders==1 ? 1 : 3;
if ( args->split_by_id ) args->nstats = 2;

if ( args->filter_str )
{
args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
if ( args->files->nreaders==2 )
args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
}

// AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
args->m_af = 101;
for (i=0; i<args->files->nreaders; i++)
Expand Down Expand Up @@ -437,6 +455,8 @@ static void init_stats(args_t *args)
stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int));
stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int));
Expand Down Expand Up @@ -514,6 +534,8 @@ static void destroy_stats(args_t *args)
if (stats->smpl_hets) free(stats->smpl_hets);
if (stats->smpl_homAA) free(stats->smpl_homAA);
if (stats->smpl_homRR) free(stats->smpl_homRR);
if (stats->smpl_indel_homs) free(stats->smpl_indel_homs);
if (stats->smpl_indel_hets) free(stats->smpl_indel_hets);
if (stats->smpl_ts) free(stats->smpl_ts);
if (stats->smpl_tv) free(stats->smpl_tv);
if (stats->smpl_indels) free(stats->smpl_indels);
Expand Down Expand Up @@ -543,6 +565,8 @@ static void destroy_stats(args_t *args)
free(args->smpl_r_snps);
free(args->smpl_r_indels);
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
if (args->filter[0]) filter_destroy(args->filter[0]);
if (args->filter[1]) filter_destroy(args->filter[1]);
}

static void init_iaf(args_t *args, bcf_sr_t *reader)
Expand Down Expand Up @@ -809,7 +833,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
case GT_HOM_AA: nalt_tot++; break;
}
#endif
if ( line_type&VCF_SNP )
if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP
{
if ( gt == GT_HET_RA ) stats->smpl_hets[is]++;
else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++;
Expand All @@ -827,7 +851,12 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int
}
if ( line_type&VCF_INDEL )
{
if ( gt != GT_HOM_RR ) stats->smpl_indels[is]++;
if ( gt != GT_HOM_RR )
{
stats->smpl_indels[is]++;
if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++;
else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++;
}
if ( stats->smpl_frm_shifts )
{
assert( ial<line->n_allele && jal<line->n_allele );
Expand Down Expand Up @@ -1001,15 +1030,26 @@ static void do_vcf_stats(args_t *args)
{
bcf_sr_t *reader = NULL;
bcf1_t *line = NULL;
int ret = 0, i;
int ret = 0, i, pass = 1;
for (i=0; i<files->nreaders; i++)
{
if ( !bcf_sr_has_line(files,i) ) continue;
if ( args->filter[i] )
{
int is_ok = filter_test(args->filter[i], bcf_sr_get_line(files,i), NULL);
if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
if ( !is_ok ) { pass = 0; break; }
}
ret |= 1<<i;
if ( reader ) continue;
reader = &files->readers[i];
line = files->readers[i].buffer[0];
if ( !reader )
{
reader = &files->readers[i];
line = bcf_sr_get_line(files,i);
}

}
if ( !pass ) continue;

int line_type = bcf_get_variant_types(line);
init_iaf(args, reader);

Expand All @@ -1019,6 +1059,8 @@ static void do_vcf_stats(args_t *args)

stats->n_records++;

if ( line_type==VCF_REF )
stats->n_noalts++;
if ( line_type&VCF_SNP )
do_snp_stats(args, stats, reader);
if ( line_type&VCF_INDEL )
Expand Down Expand Up @@ -1095,6 +1137,7 @@ static void print_stats(args_t *args)
{
stats_t *stats = &args->stats[id];
printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records);
printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts);
printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps);
printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps);
printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels);
Expand Down Expand Up @@ -1354,19 +1397,22 @@ static void print_stats(args_t *args)
}


if ( args->exons )
printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n");
for (id=0; id<args->nstats; id++)
{
printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\n");
for (id=0; id<args->nstats; id++)
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
{
stats_t *stats = &args->stats[id];
for (i=0; i<args->files->n_smpl; i++)
int na = 0, in = 0, out = 0;
if ( args->exons )
{
int na = stats->smpl_frm_shifts[i*3 + 0];
int in = stats->smpl_frm_shifts[i*3 + 1];
int out = stats->smpl_frm_shifts[i*3 + 2];
printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0);
na = stats->smpl_frm_shifts[i*3 + 0];
in = stats->smpl_frm_shifts[i*3 + 1];
out = stats->smpl_frm_shifts[i*3 + 2];
}
int nhom = stats->smpl_indel_homs[i];
int nhet = stats->smpl_indel_hets[i];
printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom);
}
}

Expand Down Expand Up @@ -1425,10 +1471,12 @@ static void usage(void)
fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
fprintf(stderr, " -e, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -E, --exons <file.gz> tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
fprintf(stderr, " -i, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -s, --samples <list> list of samples for sample stats, \"-\" to include all samples\n");
Expand All @@ -1453,24 +1501,26 @@ int main_vcfstats(int argc, char *argv[])
static struct option loptions[] =
{
{"1st-allele-only",0,0,'1'},
{"include",1,0,'i'},
{"exclude",1,0,'e'},
{"help",0,0,'h'},
{"collapse",1,0,'c'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"verbose",0,0,'v'},
{"depth",1,0,'d'},
{"apply-filters",1,0,'f'},
{"exons",1,0,'e'},
{"exons",1,0,'E'},
{"samples",1,0,'s'},
{"samples-file",1,0,'S'},
{"split-by-ID",0,0,'i'},
{"split-by-ID",0,0,'I'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:it:T:F:f:1u:v",loptions,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
switch (c) {
case 'u': add_user_stats(args,optarg); break;
case '1': args->first_allele_only = 1; break;
Expand All @@ -1497,10 +1547,12 @@ int main_vcfstats(int argc, char *argv[])
case 'f': args->files->apply_filters = optarg; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 'e': args->exons_fname = optarg; break;
case 'E': args->exons_fname = optarg; break;
case 's': args->samples_list = optarg; break;
case 'S': args->samples_list = optarg; args->samples_is_file = 1; break;
case 'i': args->split_by_id = 1; break;
case 'I': args->split_by_id = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
Expand Down

0 comments on commit 5d52a36

Please sign in to comment.