Skip to content

Commit

Permalink
isec supports -i/-e filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Oct 1, 2014
1 parent 258cd52 commit 802367c
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) vcmp.h $(HTSDIR)/htslib/khash.h
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(bcftools_h) rbuf.h
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
Expand Down
1 change: 1 addition & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
test_vcf_idxstats($opts,in=>'empty',args=>'-n',out=>'empty.idx_count.out');
test_vcf_check($opts,in=>'check',out=>'check.chk');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.out',args=>'-n =2');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.flt.out',args=>'-n =2 -i"STRLEN(REF)==2"');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.both.out',args=>'-n =2 -c both');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.any.out',args=>'-n =2 -c any');
test_vcf_isec($opts,in=>['isec.a','isec.b'],out=>'isec.ab.C.out',args=>'-C -c any');
Expand Down
118 changes: 103 additions & 15 deletions vcfisec.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,24 @@ THE SOFTWARE. */
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
#include "bcftools.h"
#include "filter.h"

#define OP_PLUS 1
#define OP_MINUS 2
#define OP_EQUAL 3
#define OP_VENN 4
#define OP_COMPLEMENT 5

// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2

typedef struct
{
int isec_op, isec_n, *write, iwrite, nwrite, output_type;
int nflt, *flt_logic;
filter_t **flt;
char **flt_expr;
bcf_srs_t *files;
FILE *fh_log, *fh_sites;
htsFile **fh_out;
Expand Down Expand Up @@ -155,6 +163,20 @@ void isec_vcf(args_t *args)
for (i=0; i<files->nreaders; i++)
{
if ( !bcf_sr_has_line(files,i) ) continue;

if ( args->nflt && args->flt[i] )
{
bcf1_t *rec = bcf_sr_get_line(files, i);
int pass = filter_test(args->flt[i], rec, NULL);
if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
if ( !pass )
{
files->has_line[i] = 0;
n--;
continue;
}
}

if ( !line )
{
line = files->readers[i].buffer[0];
Expand Down Expand Up @@ -228,14 +250,59 @@ void isec_vcf(args_t *args)
if ( out_fh ) hts_close(out_fh);
}

static void add_filter(args_t *args, char *expr, int logic)
{
args->nflt++;
args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
if ( expr[0]=='-' && expr[1]==0 )
{
args->flt_expr[args->nflt-1] = NULL;
args->flt[args->nflt-1] = NULL;
}
else
args->flt_expr[args->nflt-1] = expr;
args->flt_logic[args->nflt-1] = logic;
}

static void destroy_data(args_t *args);
static void init_data(args_t *args)
{
int i;
if ( args->nflt )
{
if ( args->nflt > 1 && args->nflt!=args->files->nreaders )
error("Error: expected either one -i/-e option or as many as there are input files\n");
if ( args->nflt < args->files->nreaders )
{
if ( !args->flt_expr[0] ) error("Error: useless use of -i/-e\n");
args->nflt = args->files->nreaders;
args->flt_expr = (char**) realloc(args->flt_expr,sizeof(char*)*args->nflt);
args->flt_logic = (int*) realloc(args->flt_logic,sizeof(int)*args->nflt);
args->flt = (filter_t**) realloc(args->flt,sizeof(filter_t*)*args->nflt);
for (i=1; i<args->nflt; i++)
{
args->flt_expr[i] = args->flt_expr[0];
args->flt_logic[i] = args->flt_logic[0];
args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
}
args->flt[0] = filter_init(args->files->readers[0].header,args->flt_expr[0]);
}
else
{
for (i=0; i<args->files->nreaders; i++)
{
if ( !args->flt_expr[i] ) continue;
args->flt[i] = filter_init(args->files->readers[i].header,args->flt_expr[i]);
}
}
}

// Which files to write: parse the string passed with -w
char *p = args->write_files;
while (p && *p)
{
int i;
if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int));
if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files);
if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files);
Expand Down Expand Up @@ -333,10 +400,22 @@ static void init_data(args_t *args)

static void destroy_data(args_t *args)
{
int i;
if ( args->nflt )
{
for (i=0; i<args->nflt; i++)
{
if ( !args->flt[i] ) continue;
filter_destroy(args->flt[i]);
}
free(args->flt_expr);
free(args->flt);
free(args->flt_logic);
}
if ( args->prefix )
{
fclose(args->fh_log);
int i, n = args->isec_op==OP_VENN ? 3 : args->files->nreaders;
int n = args->isec_op==OP_VENN ? 3 : args->files->nreaders;
for (i=0; i<n; i++)
{
if ( !args->fnames[i] ) continue;
Expand Down Expand Up @@ -366,23 +445,28 @@ static void usage(void)
fprintf(stderr, "Usage: bcftools isec [options] <A.vcf.gz> <B.vcf.gz> [...]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -n, --nfiles [+-=]<int> output positions present in this many (=), this many or more (+), or this many or fewer (-) files\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(stderr, " -C, --complement output positions present only in the first file but missing in the others\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
fprintf(stderr, " -i, --include <expr> include only sites for which the expression is true\n");
fprintf(stderr, " -n, --nfiles [+-=]<int> output positions present in this many (=), this many or more (+), or this many or fewer (-) files\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -p, --prefix <dir> if given, subset each of the input files accordingly, see also -w\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " -w, --write <list> list of files to write with -p given as 1-based indexes. By default, all files are written\n");
fprintf(stderr, "\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Create intersection and complements of two sets saving the output in dir/*\n");
fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir\n");
fprintf(stderr, "\n");
fprintf(stderr, " # Filter sites in A and B (but not in C) and create intersection\n");
fprintf(stderr, " bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e - A.vcf.gz B.vcf.gz C.vcf.gz -p dir\n");
fprintf(stderr, "\n");
fprintf(stderr, " # Extract and write records from A shared by both A and B using exact allele match\n");
fprintf(stderr, " bcftools isec A.vcf.gz B.vcf.gz -p dir -n =2 -w 1\n");
fprintf(stderr, "\n");
Expand All @@ -405,6 +489,8 @@ int main_vcfisec(int argc, char *argv[])
static struct option loptions[] =
{
{"help",0,0,'h'},
{"exclude",1,0,'e'},
{"include",1,0,'i'},
{"collapse",1,0,'c'},
{"complement",0,0,'C'},
{"apply-filters",1,0,'f'},
Expand All @@ -419,7 +505,7 @@ int main_vcfisec(int argc, char *argv[])
{"output-type",1,0,'O'},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:",loptions,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) {
switch (c) {
case 'o': args->output_fname = optarg; break;
case 'O':
Expand Down Expand Up @@ -449,6 +535,8 @@ int main_vcfisec(int argc, char *argv[])
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'p': args->prefix = optarg; break;
case 'w': args->write_files = optarg; break;
case 'i': add_filter(args, optarg, FLT_INCLUDE); break;
case 'e': add_filter(args, optarg, FLT_EXCLUDE); break;
case 'n':
{
char *p = optarg;
Expand Down

0 comments on commit 802367c

Please sign in to comment.