Skip to content

Commit

Permalink
Allow duplicate positions in both VCF and the TAB file with `call -C …
Browse files Browse the repository at this point in the history
…alleles`. Fixes samtools#915
  • Loading branch information
pd3 committed Apr 18, 2019
1 parent 64a5e1d commit c76dbcf
Show file tree
Hide file tree
Showing 12 changed files with 293 additions and 89 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ bam_sample_h = bam_sample.h $(htslib_sam_h)
main.o: main.c $(htslib_hts_h) config.h version.h $(bcftools_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) $(htslib_khash_h)
vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h) $(vcfbuf_h)
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
Expand Down
2 changes: 2 additions & 0 deletions call.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ typedef struct
}
family_t;


// For the `-C alleles -i` constrained calling
typedef struct
{
uint32_t n:31, used:1;
Expand Down
31 changes: 31 additions & 0 deletions test/mpileup.cals.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=hs37d5.fa
##contig=<ID=6,length=171115067>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=IDV,Number=1,Type=Integer,Description="Maximum number of raw reads supporting an indel">
##INFO=<ID=IMF,Number=1,Type=Float,Description="Maximum fraction of raw reads supporting an indel">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)",Version="3">
##INFO=<ID=RPB,Number=1,Type=Float,Description="Mann-Whitney U test of Read Position Bias (bigger is better)">
##INFO=<ID=MQB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality Bias (bigger is better)">
##INFO=<ID=BQB,Number=1,Type=Float,Description="Mann-Whitney U test of Base Quality Bias (bigger is better)">
##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths">
##FORMAT=<ID=ADF,Number=R,Type=Integer,Description="Allelic depths on the forward strand">
##FORMAT=<ID=ADR,Number=R,Type=Integer,Description="Allelic depths on the reverse strand">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=ICB,Number=1,Type=Float,Description="Inbreeding Coefficient Binomial test (bigger is better)">
##INFO=<ID=HOB,Number=1,Type=Float,Description="Bias in the number of HOMs number (smaller is better)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
6 263708 . CTT C,CT 280 . DP=211;MQSB=0.901445;MQ0F=0;AC=0,0;AN=2;DP4=117,27,0,0;MQ=57 GT:PL:DP:SP:ADF:ADR:AD 0/0:0,255,255,255,255,255:144:0:117,0,0:27,0,0:144,0,0
1 change: 1 addition & 0 deletions test/mpileup.cals.1.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
6 263708 CTT,C,CT
26 changes: 26 additions & 0 deletions test/mpileup.cals.1.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
##fileformat=VCFv4.2
##reference=hs37d5.fa
##contig=<ID=6,length=171115067>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=IDV,Number=1,Type=Integer,Description="Maximum number of raw reads supporting an indel">
##INFO=<ID=IMF,Number=1,Type=Float,Description="Maximum fraction of raw reads supporting an indel">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)",Version="3">
##INFO=<ID=RPB,Number=1,Type=Float,Description="Mann-Whitney U test of Read Position Bias (bigger is better)">
##INFO=<ID=MQB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality Bias (bigger is better)">
##INFO=<ID=BQB,Number=1,Type=Float,Description="Mann-Whitney U test of Base Quality Bias (bigger is better)">
##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths">
##FORMAT=<ID=ADF,Number=R,Type=Integer,Description="Allelic depths on the forward strand">
##FORMAT=<ID=ADR,Number=R,Type=Integer,Description="Allelic depths on the reverse strand">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
6 263708 . C <*> 0 . DP=211;I16=117,27,0,0,4635,154411,0,0,8284,486348,0,0,2730,59520,0,0;QS=1,0;MQSB=0.901445;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,255,255:144:0:117,0:27,0:144,0
6 263708 . CTTTTTTTTTTTTTT CTTTTTTTTTTTTT,CTTTTTTTTTTTTTTT,CTTTTTTTTTTTT 0 . INDEL;IDV=27;IMF=0.127358;DP=212;I16=100,83,20,9,3133,71279,800,23372,10402,608546,1663,97179,3447,74011,628,14310;QS=0.84686,0.0839802,0.0494001,0.0197601;VDB=0.0105186;SGB=-0.693079;MQSB=0.971475;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,255,39,255,51,39,255,74,57,39:212:8:100,12,7,1:83,5,3,1:183,17,10,2
22 changes: 22 additions & 0 deletions test/mpileup.cals.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file://grch37.chr20.fa.gz
##contig=<ID=1,length=249250621>
##contig=<ID=16,length=249250621>
##contig=<ID=17,length=249250621>
##contig=<ID=20,length=249250621>
##contig=<ID=21,length=249250621>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=ICB,Number=1,Type=Float,Description="Inbreeding Coefficient Binomial test (bigger is better)">
##INFO=<ID=HOB,Number=1,Type=Float,Description="Bias in the number of HOMs number (smaller is better)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
1 1368833 . T C 39.5768 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,13
1 1368833 . TAAAAAAAAAAAAAAAA TAAAAAAAAAAAAAA 24.1741 . DP=2;AC=0;AN=2;DP4=0,1,1,0;MQ=60 GT:PL 0/0:6,0,6
18 changes: 18 additions & 0 deletions test/mpileup.cals.2.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
1 1368828 GT,G
1 1368833 TAA,T
1 1368833 T,C
1 1368833 T,G
1 1368834 A,T
16 60288 C,A
17 355 G,A
20 58799 T,C
20 58799 T,C
20 68800 A,G
20 68810 G,A
20 69066 C,G
20 69094 G,A
20 69408 C,T
21 9411409 T,C
21 9411485 C,A
21 9411497 A,G
21 9412485 C,G
17 changes: 17 additions & 0 deletions test/mpileup.cals.2.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file://grch37.chr20.fa.gz
##contig=<ID=1,length=249250621>
##contig=<ID=16,length=249250621>
##contig=<ID=17,length=249250621>
##contig=<ID=20,length=249250621>
##contig=<ID=21,length=249250621>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
1 1368833 . T <*> 0 . DP=1;I16=0,1,0,0,13,169,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,13
1 1368833 . TAAAAAAAAAAAAAAAA TAAAAAAAAAAAAAA 0 . DP=2;I16=0,1,1,0,10,100,30,900,60,3600,60,3600,3,9,25,625;QS=0.5,0.5 PL 6,0,6
2 changes: 2 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@
test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.5.out',tab=>'mpileup.5',args=>'-i');
test_vcf_call_cAls($opts,in=>'mpileup.4',out=>'mpileup.cAls.6.out',tab=>'mpileup.6',args=>'-i');
test_vcf_call_cAls($opts,in=>'mpileup.5',out=>'mpileup.cAls.7.out',tab=>'mpileup.7',args=>'-i');
test_vcf_call_cAls($opts,in=>'mpileup.cals.1',out=>'mpileup.cals.1.out',tab=>'mpileup.cals.1',args=>'');
test_vcf_call_cAls($opts,in=>'mpileup.cals.2',out=>'mpileup.cals.2.out',tab=>'mpileup.cals.2',args=>'');
test_vcf_call($opts,in=>'mpileup.c',out=>'mpileup.c.1.out',args=>'-cv');
# test_vcf_call($opts,in=>'mpileup.c',out=>'mpileup.c.2.out',args=>'-cg0');
test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.samples');
Expand Down
15 changes: 15 additions & 0 deletions vcfbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,21 @@ bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
return ret;
}

bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx)
{
int i = rbuf_kth(&buf->rbuf, idx);
return i<0 ? NULL : buf->vcf[i].rec;
}

bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx)
{
int i = rbuf_kth(&buf->rbuf, idx);
if ( i<0 ) return NULL;
bcf1_t *rec = buf->vcf[i].rec;
rbuf_remove_kth(&buf->rbuf, vcfrec_t, idx, buf->vcf);
return rec;
}

static int cmpvrec(const void *_a, const void *_b)
{
vcfrec_t *a = *((vcfrec_t**) _a);
Expand Down
12 changes: 12 additions & 0 deletions vcfbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ void vcfbuf_destroy(vcfbuf_t *buf);
*/
bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap);

/*
* vcfbuf_peek() - return pointer to i-th record in the buffer but do not remove it from the buffer
* @idx: 0-based index to buffered lines
*/
bcf1_t *vcfbuf_peek(vcfbuf_t *buf, int idx);

/*
* vcfbuf_remove() - return pointer to i-th record in the buffer and remove it from the buffer
* @idx: 0-based index to buffered lines
*/
bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx);

bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all);

/*
Expand Down
Loading

0 comments on commit c76dbcf

Please sign in to comment.