From 5e1e930cebbe2b44f03ae16d44f8575869c96f01 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 9 Mar 2015 21:53:01 +0000 Subject: [PATCH 1/3] stats: previously omitted ALT=. sites now count as SNPs in per-sample stats (HomRR counts) --- test/check.chk | 5 +++-- test/stats.chk | 3 +++ vcfstats.c | 7 +++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/check.chk b/test/check.chk index 1012e5c81..4ad6cf806 100644 --- a/test/check.chk +++ b/test/check.chk @@ -5,6 +5,7 @@ # SN [2]id [3]key [4]value SN 0 number of samples: 2 SN 0 number of records: 18 +SN 0 number of no-ALTs: 1 SN 0 number of SNPs: 5 SN 0 number of MNPs: 1 SN 0 number of indels: 9 @@ -70,8 +71,8 @@ DP 0 32 4 11.111111 0 0.000000 DP 0 35 4 11.111111 0 0.000000 # PSC, Per-sample counts # PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons -PSC 0 A 0 2 3 3 2 9 28.7 1 -PSC 0 B 1 1 3 2 2 9 28.7 0 +PSC 0 A 0 2 4 3 2 9 28.7 1 +PSC 0 B 1 1 4 2 2 9 28.7 0 # HWE # HWE [2]id [3]1st ALT allele frequency [4]Number of observations [5]25th percentile [6]median [7]75th percentile HWE 0 0.000000 2 0.490000 0.490000 0.990000 diff --git a/test/stats.chk b/test/stats.chk index ea87209c1..6f603866d 100644 --- a/test/stats.chk +++ b/test/stats.chk @@ -1,6 +1,7 @@ SN 0 number of samples: 3 SN 1 number of samples: 3 SN 0 number of records: 0 +SN 0 number of no-ALTs: 0 SN 0 number of SNPs: 0 SN 0 number of MNPs: 0 SN 0 number of indels: 0 @@ -8,6 +9,7 @@ SN 0 number of others: 0 SN 0 number of multiallelic sites: 0 SN 0 number of multiallelic SNP sites: 0 SN 1 number of records: 0 +SN 1 number of no-ALTs: 0 SN 1 number of SNPs: 0 SN 1 number of MNPs: 0 SN 1 number of indels: 0 @@ -15,6 +17,7 @@ SN 1 number of others: 0 SN 1 number of multiallelic sites: 0 SN 1 number of multiallelic SNP sites: 0 SN 2 number of records: 3 +SN 2 number of no-ALTs: 0 SN 2 number of SNPs: 3 SN 2 number of MNPs: 0 SN 2 number of indels: 0 diff --git a/vcfstats.c b/vcfstats.c index e0a0c2c83..8892f25fd 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -75,7 +75,7 @@ smpl_r_t; typedef struct { - int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records; + int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons #if HWE_STATS int *af_hwe; @@ -809,7 +809,7 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int case GT_HOM_AA: nalt_tot++; break; } #endif - if ( line_type&VCF_SNP ) + if ( line_type&VCF_SNP || line_type==VCF_REF ) // count ALT=. as SNP { if ( gt == GT_HET_RA ) stats->smpl_hets[is]++; else if ( gt == GT_HET_AA ) stats->smpl_hets[is]++; @@ -1019,6 +1019,8 @@ static void do_vcf_stats(args_t *args) stats->n_records++; + if ( line_type==VCF_REF ) + stats->n_noalts++; if ( line_type&VCF_SNP ) do_snp_stats(args, stats, reader); if ( line_type&VCF_INDEL ) @@ -1095,6 +1097,7 @@ static void print_stats(args_t *args) { stats_t *stats = &args->stats[id]; printf("SN\t%d\tnumber of records:\t%d\n", id, stats->n_records); + printf("SN\t%d\tnumber of no-ALTs:\t%d\n", id, stats->n_noalts); printf("SN\t%d\tnumber of SNPs:\t%d\n", id, stats->n_snps); printf("SN\t%d\tnumber of MNPs:\t%d\n", id, stats->n_mnps); printf("SN\t%d\tnumber of indels:\t%d\n", id, stats->n_indels); From 6fed5ac7a68a2cd0bc2121b99bcfbb3eacbec3e5 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 15 May 2015 09:35:43 +0100 Subject: [PATCH 2/3] stats: output also indel nHET and nAA in PSI stats --- test/check.chk | 4 ++++ test/stats.chk | 9 +++++++++ vcfstats.c | 33 +++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/test/check.chk b/test/check.chk index 4ad6cf806..0f78e654e 100644 --- a/test/check.chk +++ b/test/check.chk @@ -73,6 +73,10 @@ DP 0 35 4 11.111111 0 0.000000 # PSC [2]id [3]sample [4]nRefHom [5]nNonRefHom [6]nHets [7]nTransitions [8]nTransversions [9]nIndels [10]average depth [11]nSingletons PSC 0 A 0 2 4 3 2 9 28.7 1 PSC 0 B 1 1 4 2 2 9 28.7 0 +# PSI, Per-Sample Indels +# PSI [2]id [3]sample [4]in-frame [5]out-frame [6]not applicable [7]out/(in+out) ratio [8]nHets [9]nAA +PSI 0 A 0 0 0 0.00 9 0 +PSI 0 B 0 0 0 0.00 9 0 # HWE # HWE [2]id [3]1st ALT allele frequency [4]Number of observations [5]25th percentile [6]median [7]75th percentile HWE 0 0.000000 2 0.490000 0.490000 0.990000 diff --git a/test/stats.chk b/test/stats.chk index 6f603866d..6b2dd3be2 100644 --- a/test/stats.chk +++ b/test/stats.chk @@ -87,4 +87,13 @@ PSC 1 C 0 0 0 0 0 0 0.0 0 PSC 2 A 3 0 0 0 0 0 0.0 0 PSC 2 B 0 0 3 3 0 0 0.0 0 PSC 2 C 0 3 0 3 0 0 0.0 0 +PSI 0 A 0 0 0 0.00 0 0 +PSI 0 B 0 0 0 0.00 0 0 +PSI 0 C 0 0 0 0.00 0 0 +PSI 1 A 0 0 0 0.00 0 0 +PSI 1 B 0 0 0 0.00 0 0 +PSI 1 C 0 0 0 0.00 0 0 +PSI 2 A 0 0 0 0.00 0 0 +PSI 2 B 0 0 0 0.00 0 0 +PSI 2 C 0 0 0 0.00 0 0 HWE 2 49.000000 3 0.330000 0.330000 0.330000 diff --git a/vcfstats.c b/vcfstats.c index 8892f25fd..a207ccc53 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -92,6 +92,7 @@ typedef struct int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; int subst[15]; int *smpl_hets, *smpl_homRR, *smpl_homAA, *smpl_ts, *smpl_tv, *smpl_indels, *smpl_ndp, *smpl_sngl; + int *smpl_indel_hets, *smpl_indel_homs; int *smpl_frm_shifts; // not-applicable, in-frame, out-frame unsigned long int *smpl_dp; idist_t dp, dp_sites; @@ -437,6 +438,8 @@ static void init_stats(args_t *args) stats->smpl_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homAA = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_homRR = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_indel_hets = (int *) calloc(args->files->n_smpl,sizeof(int)); + stats->smpl_indel_homs = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_ts = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_tv = (int *) calloc(args->files->n_smpl,sizeof(int)); stats->smpl_indels = (int *) calloc(args->files->n_smpl,sizeof(int)); @@ -514,6 +517,8 @@ static void destroy_stats(args_t *args) if (stats->smpl_hets) free(stats->smpl_hets); if (stats->smpl_homAA) free(stats->smpl_homAA); if (stats->smpl_homRR) free(stats->smpl_homRR); + if (stats->smpl_indel_homs) free(stats->smpl_indel_homs); + if (stats->smpl_indel_hets) free(stats->smpl_indel_hets); if (stats->smpl_ts) free(stats->smpl_ts); if (stats->smpl_tv) free(stats->smpl_tv); if (stats->smpl_indels) free(stats->smpl_indels); @@ -827,7 +832,12 @@ static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int } if ( line_type&VCF_INDEL ) { - if ( gt != GT_HOM_RR ) stats->smpl_indels[is]++; + if ( gt != GT_HOM_RR ) + { + stats->smpl_indels[is]++; + if ( gt==GT_HET_RA || gt==GT_HET_AA ) stats->smpl_indel_hets[is]++; + else if ( gt==GT_HOM_AA ) stats->smpl_indel_homs[is]++; + } if ( stats->smpl_frm_shifts ) { assert( ialn_allele && jaln_allele ); @@ -1357,19 +1367,22 @@ static void print_stats(args_t *args) } - if ( args->exons ) + printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\t[8]nHets\t[9]nAA\n"); + for (id=0; idnstats; id++) { - printf("# PSI, Per-Sample Indels\n# PSI\t[2]id\t[3]sample\t[4]in-frame\t[5]out-frame\t[6]not applicable\t[7]out/(in+out) ratio\n"); - for (id=0; idnstats; id++) + stats_t *stats = &args->stats[id]; + for (i=0; ifiles->n_smpl; i++) { - stats_t *stats = &args->stats[id]; - for (i=0; ifiles->n_smpl; i++) + int na = 0, in = 0, out = 0; + if ( args->exons ) { - int na = stats->smpl_frm_shifts[i*3 + 0]; - int in = stats->smpl_frm_shifts[i*3 + 1]; - int out = stats->smpl_frm_shifts[i*3 + 2]; - printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0); + na = stats->smpl_frm_shifts[i*3 + 0]; + in = stats->smpl_frm_shifts[i*3 + 1]; + out = stats->smpl_frm_shifts[i*3 + 2]; } + int nhom = stats->smpl_indel_homs[i]; + int nhet = stats->smpl_indel_hets[i]; + printf("PSI\t%d\t%s\t%d\t%d\t%d\t%.2f\t%d\t%d\n", id,args->files->samples[i], in,out,na,in+out?1.0*out/(in+out):0,nhet,nhom); } } From 255ee9a151cf7ca23ff6cf6f294f2d89dac48ea6 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 13 May 2015 10:16:50 +0100 Subject: [PATCH 3/3] Support for -i/-e filtering in vcfstats For consistency, changed existing -i/-e in vcfstats to -I/-E. [NEWS] * stats: -i/-e short options changed to -I/-E to accommodate the filtering -i/-e (--include/--exclude) options used in other tools --- doc/bcftools.txt | 12 ++++++++-- vcfstats.c | 60 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 574ac7ef5..027cb4b7e 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1439,7 +1439,11 @@ columns. *--debug*:: produce verbose per-site and per-sample output -*-e, --exons* 'file.gz':: +*-e, --exclude* 'EXPRESSION':: + exclude sites for which 'EXPRESSION' is true. For valid expressions see + *<>*. + +*-E, --exons* 'file.gz':: tab-delimited file with exons for indel frameshifts statistics. The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. The file is BGZF-compressed and indexed with tabix @@ -1453,7 +1457,11 @@ columns. *-F, --fasta-ref* 'ref.fa':: faidx indexed reference sequence file to determine INDEL context -*-i, --split-by-ID*:: +*-i, --include* 'EXPRESSION':: + include only sites for which 'EXPRESSION' is true. For valid expressions see + *<>*. + +*-I, --split-by-ID*:: collect stats separately for sites which have the ID column set ("known sites") or which do not have the ID column set ("novel sites"). diff --git a/vcfstats.c b/vcfstats.c index a207ccc53..578be7bce 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2015 Genome Research Ltd. Author: Petr Danecek @@ -38,6 +38,11 @@ THE SOFTWARE. */ #include #include #include "bcftools.h" +#include "filter.h" + +// Logic of the filters: include or exclude sites which match the filters? +#define FLT_INCLUDE 1 +#define FLT_EXCLUDE 2 #define HWE_STATS 1 #define QUAL_STATS 1 @@ -146,6 +151,11 @@ typedef struct char **argv, *exons_fname, *regions_list, *samples_list, *targets_list; int argc, verbose_sites, first_allele_only, samples_is_file; int split_by_id, nstats; + + filter_t *filter[2]; + char *filter_str; + int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE + // Per Sample r working data arrays of size equal to number of samples smpl_r_t* smpl_r_snps; smpl_r_t* smpl_r_indels; @@ -388,6 +398,13 @@ static void init_stats(args_t *args) args->nstats = args->files->nreaders==1 ? 1 : 3; if ( args->split_by_id ) args->nstats = 2; + if ( args->filter_str ) + { + args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str); + if ( args->files->nreaders==2 ) + args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str); + } + // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs args->m_af = 101; for (i=0; ifiles->nreaders; i++) @@ -548,6 +565,8 @@ static void destroy_stats(args_t *args) free(args->smpl_r_snps); free(args->smpl_r_indels); if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx); + if (args->filter[0]) filter_destroy(args->filter[0]); + if (args->filter[1]) filter_destroy(args->filter[1]); } static void init_iaf(args_t *args, bcf_sr_t *reader) @@ -1011,15 +1030,26 @@ static void do_vcf_stats(args_t *args) { bcf_sr_t *reader = NULL; bcf1_t *line = NULL; - int ret = 0, i; + int ret = 0, i, pass = 1; for (i=0; inreaders; i++) { if ( !bcf_sr_has_line(files,i) ) continue; + if ( args->filter[i] ) + { + int is_ok = filter_test(args->filter[i], bcf_sr_get_line(files,i), NULL); + if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1; + if ( !is_ok ) { pass = 0; break; } + } ret |= 1<readers[i]; - line = files->readers[i].buffer[0]; + if ( !reader ) + { + reader = &files->readers[i]; + line = bcf_sr_get_line(files,i); + } + } + if ( !pass ) continue; + int line_type = bcf_get_variant_types(line); init_iaf(args, reader); @@ -1441,10 +1471,12 @@ static void usage(void) fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); fprintf(stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); fprintf(stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(stderr, " -e, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); + fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n"); fprintf(stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(stderr, " -i, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); + fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); @@ -1469,6 +1501,8 @@ int main_vcfstats(int argc, char *argv[]) static struct option loptions[] = { {"1st-allele-only",0,0,'1'}, + {"include",1,0,'i'}, + {"exclude",1,0,'e'}, {"help",0,0,'h'}, {"collapse",1,0,'c'}, {"regions",1,0,'r'}, @@ -1476,17 +1510,17 @@ int main_vcfstats(int argc, char *argv[]) {"verbose",0,0,'v'}, {"depth",1,0,'d'}, {"apply-filters",1,0,'f'}, - {"exons",1,0,'e'}, + {"exons",1,0,'E'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, - {"split-by-ID",0,0,'i'}, + {"split-by-ID",0,0,'I'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"fasta-ref",1,0,'F'}, {"user-tstv",1,0,'u'}, {0,0,0,0} }; - while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:it:T:F:f:1u:v",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) { switch (c) { case 'u': add_user_stats(args,optarg); break; case '1': args->first_allele_only = 1; break; @@ -1513,10 +1547,12 @@ int main_vcfstats(int argc, char *argv[]) case 'f': args->files->apply_filters = optarg; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'e': args->exons_fname = optarg; break; + case 'E': args->exons_fname = optarg; break; case 's': args->samples_list = optarg; break; case 'S': args->samples_list = optarg; args->samples_is_file = 1; break; - case 'i': args->split_by_id = 1; break; + case 'I': args->split_by_id = 1; break; + case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg);