Skip to content

Commit

Permalink
Make fill-tags work with large number of alleles. Fixes samtools#621
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Sep 19, 2018
1 parent d49d61f commit 20a170e
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 16 deletions.
35 changes: 19 additions & 16 deletions plugins/fill-tags.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <htslib/kseq.h>
#include <htslib/vcf.h>
#include <htslib/khash_str2int.h>
#include <htslib/kbitset.h>
#include "bcftools.h"

#define SET_AN (1<<0)
Expand Down Expand Up @@ -72,6 +73,7 @@ typedef struct
double *hwe_probs;
int mhwe_probs;
kstring_t str;
kbitset_t *bset;
}
args_t;

Expand Down Expand Up @@ -389,19 +391,17 @@ void calc_hwe(args_t *args, int nref, int nalt, int nhet, float *p_hwe, float *p
*p_hwe = prob;
}

static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, int als)
static inline void set_counts(pop_t *pop, int is_half, int is_hom, int is_hemi, kbitset_t *bset)
{
int ial;
for (ial=0; als; ial++)
kbitset_iter_t itr;
int i;
kbs_start(&itr);
while ((i = kbs_next(bset, &itr)) >= 0)
{
if ( als&1 )
{
if ( is_half ) pop->counts[ial].nac++;
else if ( !is_hom ) pop->counts[ial].nhet++;
else if ( !is_hemi ) pop->counts[ial].nhom += 2;
else pop->counts[ial].nhemi++;
}
als >>= 1;
if ( is_half ) pop->counts[i].nac++;
else if ( !is_hom ) pop->counts[i].nhet++;
else if ( !is_hemi ) pop->counts[i].nhom += 2;
else pop->counts[i].nhemi++;
}
pop->ns++;
}
Expand Down Expand Up @@ -429,14 +429,15 @@ bcf1_t *process(bcf1_t *rec)
for (i=0; i<args->npop; i++)
clean_counts(&args->pop[i], rec->n_allele);

assert( rec->n_allele < 8*sizeof(int) );
if ( kbs_resize(&args->bset, rec->n_allele) < 0 ) error("kbs_resize: failed to store %d bits\n", rec->n_allele);

#define BRANCH_INT(type_t,vector_end) \
{ \
for (i=0; i<nsmpl; i++) \
{ \
type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
int ial, als = 0, nals = 0, is_half, is_hom, is_hemi; \
int ial, nbits = 0, nals = 0, is_half, is_hom, is_hemi; \
kbs_clear(args->bset); \
for (ial=0; ial<fmt_gt->n; ial++) \
{ \
if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
Expand All @@ -446,10 +447,11 @@ bcf1_t *process(bcf1_t *rec)
\
if ( idx >= rec->n_allele ) \
error("Incorrect allele (\"%d\") in %s at %s:%d\n",idx,args->in_hdr->samples[i],bcf_seqname(args->in_hdr,rec),rec->pos+1); \
als |= (1<<idx); /* this breaks with too many alleles */ \
if ( !kbs_exists(args->bset, idx) ) nbits++; \
kbs_insert(args->bset, idx); \
} \
if ( nals==0 ) continue; /* missing genotype */ \
is_hom = als && !(als & (als-1)); /* only one bit is set */ \
is_hom = nbits==1 ? 1 : 0; /* only one bit is set for homs */ \
if ( nals!=ial ) \
{ \
if ( args->drop_missing ) is_hemi = 0, is_half = 1; \
Expand All @@ -458,7 +460,7 @@ bcf1_t *process(bcf1_t *rec)
else if ( nals==1 ) is_hemi = 1, is_half = 0; \
else is_hemi = 0, is_half = 0; \
pop_t **pop = &args->smpl2pop[i*(args->npop+1)]; \
while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,als); pop++; }\
while ( *pop ) { set_counts(*pop,is_half,is_hom,is_hemi,args->bset); pop++; } \
} \
}
switch (fmt_gt->type) {
Expand Down Expand Up @@ -650,6 +652,7 @@ void destroy(void)
free(args->pop[i].smpl);
free(args->pop[i].counts);
}
kbs_destroy(args->bset);
free(args->str.s);
free(args->pop);
free(args->smpl2pop);
Expand Down
9 changes: 9 additions & 0 deletions test/fill-tags.4.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file:///seq/references/1000Genomes-NCBI37.fasta
##contig=<ID=11,length=135006516>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
11 2343543 . A C,AA,AAA,AAAA,AAAAA,AAAAAA,AAAAAAA,AAAAAAAA,AAAAAAAAA,AAAAAAAAAA,AAAAAAAAAAA,AAAAAAAAAAAA,AAAAAAAAAAAAA,AAAAAAAAAAAAAA,AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 999 . AN=6;AC=1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1 GT 0/1 2/3 33/34
6 changes: 6 additions & 0 deletions test/many-alts.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.1
##reference=file:///seq/references/1000Genomes-NCBI37.fasta
##contig=<ID=11,length=135006516>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
11 2343543 . A C,AA,AAA,AAAA,AAAAA,AAAAAA,AAAAAAA,AAAAAAAA,AAAAAAAAA,AAAAAAAAAA,AAAAAAAAAAA,AAAAAAAAAAAA,AAAAAAAAAAAAA,AAAAAAAAAAAAAA,AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 999 . . GT 0/1 2/3 33/34
1 change: 1 addition & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@
test_vcf_plugin($opts,in=>'merge.a',out=>'fill-tags.out',cmd=>'+fill-tags --no-version',args=>'-- -t AN,AC,AC_Hom,AC_Het,AC_Hemi');
test_vcf_plugin($opts,in=>'view',out=>'fill-tags.2.out',cmd=>'+fill-tags --no-version',args=>'-- -t AC,AN,AF,MAF,NS');
test_vcf_plugin($opts,in=>'view',out=>'fill-tags.3.out',cmd=>'+fill-tags --no-version',args=>'-- -t AC -S {PATH}/fill-tags.3.smpl');
test_vcf_plugin($opts,in=>'many-alts',out=>'fill-tags.4.out',cmd=>'+fill-tags --no-version',args=>'-- -t AN,AC');
test_vcf_plugin($opts,in=>'fill-tags-hemi',out=>'fill-tags-hemi.1.out',cmd=>'+fill-tags --no-version');
test_vcf_plugin($opts,in=>'fill-tags-hemi',out=>'fill-tags-hemi.2.out',cmd=>'+fill-tags --no-version',args=>'-- -d');
test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.out',cmd=>'+GTisec',args=>' | grep -v bcftools');
Expand Down

0 comments on commit 20a170e

Please sign in to comment.