Skip to content

Commit

Permalink
call -C allels -i should not skip sites. Fixes samtools#913
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Nov 17, 2018
1 parent 6af271c commit 8457b56
Show file tree
Hide file tree
Showing 9 changed files with 475 additions and 35 deletions.
8 changes: 8 additions & 0 deletions call.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ typedef struct
}
family_t;

typedef struct
{
int n:31, used:1;
char **allele;
}
tgt_als_t;

typedef struct _ccall_t ccall_t;
typedef struct
{
Expand All @@ -73,6 +80,7 @@ typedef struct
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)
tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES

// ccall only
double indel_frac, min_perm_p, min_lrt;
Expand Down
20 changes: 10 additions & 10 deletions mcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -1272,28 +1272,28 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o
//
static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
bcf_sr_regions_t *tgt = call->srs->targets;
if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
hts_expand(char*,tgt->nals+1,call->nals,call->als);
assert( call->tgt_als->n );
if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n);
hts_expand(char*,call->tgt_als->n+1,call->nals,call->als);

int has_new = 0;

int i, j, nals = 1;
for (i=1; i<call->nals_map; i++) call->als_map[i] = -1;

if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], tgt->als[0]) < 0 )
error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,tgt->als[0],rec->d.allele[0]);
if ( vcmp_set_ref(call->vcmp, rec->d.allele[0], call->tgt_als->allele[0]) < 0 )
error("The reference alleles are not compatible at %s:%d .. %s vs %s\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1,call->tgt_als->allele[0],rec->d.allele[0]);

// create mapping from new to old alleles
call->als[0] = tgt->als[0];
call->als[0] = call->tgt_als->allele[0];
call->als_map[0] = 0;

for (i=1; i<tgt->nals; i++)
for (i=1; i<call->tgt_als->n; i++)
{
call->als[nals] = tgt->als[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
call->als[nals] = call->tgt_als->allele[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);

if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; }
if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",call->tgt_als->allele[i]); return -1; }

if ( j>=0 )
{
Expand Down
4 changes: 4 additions & 0 deletions test/mpileup.3.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
20 69066 C,G
20 69093 G,A
20 69094 G,A
20 69408 C,T
275 changes: 275 additions & 0 deletions test/mpileup.3.vcf

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions test/mpileup.4.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
20 69066 C,G
20 69076 A,G
20 69093 G,A
20 69094 G,A
20 69408 C,T
20 changes: 20 additions & 0 deletions test/mpileup.cAls.3.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file://grch37.chr20.fa.gz
##contig=<ID=20,length=4200>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=ICB,Number=1,Type=Float,Description="Inbreeding Coefficient Binomial test (bigger is better)">
##INFO=<ID=HOB,Number=1,Type=Float,Description="Bias in the number of HOMs number (smaller is better)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33
20 69093 . G A . . . GT .
20 69094 . G A . . . GT .
20 69408 . C T . . . GT .
21 changes: 21 additions & 0 deletions test/mpileup.cAls.4.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file://grch37.chr20.fa.gz
##contig=<ID=20,length=4200>
##ALT=<ID=*,Description="Represents allele(s) other than observed.">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=ICB,Number=1,Type=Float,Description="Inbreeding Coefficient Binomial test (bigger is better)">
##INFO=<ID=HOB,Number=1,Type=Float,Description="Bias in the number of HOMs number (smaller is better)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33
20 69076 . A G 55 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,28
20 69093 . G A . . . GT .
20 69094 . G A . . . GT .
20 69408 . C T . . . GT .
7 changes: 5 additions & 2 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@
test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.2.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples');
test_vcf_call_cAls($opts,in=>'mpileup',out=>'mpileup.cAls.out',tab=>'mpileup');
test_vcf_call_cAls($opts,in=>'mpileup.2',out=>'mpileup.cAls.2.out',tab=>'mpileup.2');
test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.3.out',tab=>'mpileup.3',args=>'-i');
test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.4.out',tab=>'mpileup.4',args=>'-i');
test_vcf_call($opts,in=>'mpileup.c',out=>'mpileup.c.1.out',args=>'-cv');
# test_vcf_call($opts,in=>'mpileup.c',out=>'mpileup.c.2.out',args=>'-cg0');
test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.samples');
Expand Down Expand Up @@ -793,9 +795,10 @@ sub test_vcf_call
sub test_vcf_call_cAls
{
my ($opts,%args) = @_;
my $args = exists($args{args}) ? $args{args} : '';
bgzip_tabix($opts,file=>$args{tab},suffix=>'tab',args=>'-s1 -b2 -e2');
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call --no-version -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf 2>/dev/null");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $$opts{path}/$args{in}.vcf 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call --no-version -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $args $$opts{path}/$args{in}.vcf 2>/dev/null");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call -Ob -mA -C alleles -T $$opts{tmp}/$args{tab}.tab.gz $args $$opts{path}/$args{in}.vcf 2>/dev/null | $$opts{bin}/bcftools view | grep -v ^##bcftools_");
}
sub test_vcf_filter
{
Expand Down
150 changes: 127 additions & 23 deletions vcfcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ THE SOFTWARE. */
#include "prob1.h"
#include "ploidy.h"
#include "gvcf.h"
#include "regidx.h"

void error(const char *format, ...);

Expand Down Expand Up @@ -76,6 +77,8 @@ typedef struct
int nsamples, *samples_map; // mapping from output sample names to original VCF
char *regions, *targets; // regions to process
int regions_is_file, targets_is_file;
regidx_t *tgt_idx;
regitr_t *tgt_itr, *tgt_itr_prev;

char *samples_fname;
int samples_is_file;
Expand Down Expand Up @@ -347,26 +350,121 @@ static void init_missed_line(args_t *args)
bcf_float_set_missing(args->missed_line->qual);
}

static void print_missed_line(bcf_sr_regions_t *regs, void *data)
static int tgt_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
{
args_t *args = (args_t*) data;
call_t *call = &args->aux;
bcf1_t *missed = args->missed_line;
char *ss = (char*) line;
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }
if ( *ss=='#' ) return -1; // skip comments

char *ss = regs->line.s;
int i = 0;
while ( i<args->aux.srs->targets_als-1 && *ss )
char *se = ss;
while ( *se && !isspace(*se) ) se++;

*chr_beg = ss;
*chr_end = se-1;

if ( !*se ) { fprintf(stderr,"Could not parse the line: %s\n", line); return -2; }

ss = se+1;
*beg = strtod(ss, &se);
if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
(*beg)--;
*end = *beg;

if ( !usr ) return 0; // allele information not required

ss = se+1;
tgt_als_t *als = (tgt_als_t*)payload;
als->used = 0;
als->n = 0;
als->allele = NULL;
while ( *ss )
{
se = ss;
while ( *se && *se!=',' ) se++;
als->n++;
als->allele = (char**)realloc(als->allele,als->n*sizeof(*als->allele));
als->allele[als->n-1] = (char*)malloc(se-ss+1);
memcpy(als->allele[als->n-1],ss,se-ss);
als->allele[als->n-1][se-ss] = 0;
ss = se+1;
if ( !*se ) break;
}
return 0;
}
static void tgt_free(void *payload)
{
tgt_als_t *als = (tgt_als_t*)payload;
int i;
for (i=0; i<als->n; i++) free(als->allele[i]);
free(als->allele);
}
static int tgt_overlap(args_t *args, bcf1_t *rec)
{
if ( !regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec),rec->pos,rec->pos,args->tgt_itr) ) return 0;
while ( regitr_overlap(args->tgt_itr) )
{
if ( args->tgt_itr->beg != rec->pos ) continue;
if ( args->aux.flag & CALL_CONSTR_ALLELES )
{
args->aux.tgt_als = &regitr_payload(args->tgt_itr,tgt_als_t);
args->aux.tgt_als->used = 1;
}
if ( args->flag & CF_INS_MISSED )
{
if ( !args->tgt_itr_prev ) args->tgt_itr_prev = regitr_init(args->tgt_idx);
regitr_copy(args->tgt_itr_prev, args->tgt_itr);
}
return 1;
}
return 0;
}
static void tgt_flush_region(args_t *args, char *chr, uint32_t beg, uint32_t end)
{
if ( !regidx_overlap(args->tgt_idx, chr,beg,end,args->tgt_itr) ) return;
while ( regitr_overlap(args->tgt_itr) )
{
if ( *ss=='\t' ) i++;
ss++;
if ( args->tgt_itr->beg < beg ) continue;

tgt_als_t *tgt_als = &regitr_payload(args->tgt_itr,tgt_als_t);
if ( tgt_als->used ) return;

tgt_als->used = 1;
args->missed_line->rid = bcf_hdr_name2id(args->aux.hdr,chr);
args->missed_line->pos = args->tgt_itr->beg;
bcf_update_alleles(args->aux.hdr, args->missed_line, (const char**)tgt_als->allele, tgt_als->n);
if ( bcf_write1(args->out_fh, args->aux.hdr, args->missed_line)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
}
if ( !*ss ) error("Could not parse: [%s] (%d)\n", regs->line.s,args->aux.srs->targets_als);
}
static void tgt_flush(args_t *args, bcf1_t *rec)
{
if ( rec )
{
char *chr = (char*)bcf_seqname(args->aux.hdr,rec);

missed->rid = bcf_hdr_name2id(call->hdr,regs->seq_names[regs->prev_seq]);
missed->pos = regs->start;
bcf_update_alleles_str(call->hdr, missed,ss);
if ( !args->tgt_itr_prev ) // first record
tgt_flush_region(args,chr,0,rec->pos-1);

if ( bcf_write1(args->out_fh, call->hdr, missed)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args->output_fname);
else if ( strcmp(chr,args->tgt_itr->seq) ) // first record on a new chromosome
{
tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX);
tgt_flush_region(args,chr,0,rec->pos-1);
}
else // another record on the same chromosome
tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,rec->pos-1);
}
else
{
// flush everything
if ( args->tgt_itr_prev )
tgt_flush_region(args,args->tgt_itr_prev->seq,args->tgt_itr_prev->beg+1,REGIDX_MAX);

int i, nchr = 0;
char **chr = regidx_seq_names(args->tgt_idx, &nchr);
for (i=0; i<nchr; i++)
tgt_flush_region(args,chr[i],0,REGIDX_MAX);
}
}

static void init_data(args_t *args)
Expand All @@ -376,15 +474,10 @@ static void init_data(args_t *args)
// Open files for input and output, initialize structures
if ( args->targets )
{
if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
error("Failed to read the targets: %s\n", args->targets);

if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
{
args->aux.srs->targets->missed_reg_handler = print_missed_line;
args->aux.srs->targets->missed_reg_data = args;
}
args->tgt_idx = regidx_init(args->targets, tgt_parse, tgt_free, sizeof(tgt_als_t), args->aux.flag&CALL_CONSTR_ALLELES ? args : NULL);
args->tgt_itr = regitr_init(args->tgt_idx);
}

if ( args->regions )
{
if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
Expand Down Expand Up @@ -475,6 +568,12 @@ static void init_data(args_t *args)

static void destroy_data(args_t *args)
{
if ( args->tgt_idx )
{
regidx_destroy(args->tgt_idx);
regitr_destroy(args->tgt_itr);
if ( args->tgt_itr_prev ) regitr_destroy(args->tgt_itr_prev);
}
if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux);
else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux);
else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux);
Expand Down Expand Up @@ -819,6 +918,11 @@ int main_vcfcall(int argc, char *argv[])
if ( (args.flag & CF_INDEL_ONLY) && !is_indel ) continue;
if ( (args.flag & CF_NO_INDEL) && is_indel ) continue;
if ( (args.flag & CF_ACGT_ONLY) && (bcf_rec->d.allele[0][0]=='N' || bcf_rec->d.allele[0][0]=='n') ) continue; // REF[0] is 'N'
if ( args.tgt_idx )
{
if ( !tgt_overlap(&args, bcf_rec) ) continue;
if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,bcf_rec);
}

// Which allele is symbolic? All SNPs should have it, but not indels
args.aux.unseen = 0;
Expand Down Expand Up @@ -862,7 +966,7 @@ int main_vcfcall(int argc, char *argv[])
if ( bcf_rec && bcf_write1(args.out_fh, args.aux.hdr, bcf_rec)!=0 ) error("[%s] Error: failed to write to %s\n", __func__,args.output_fname);
}
if ( args.gvcf ) gvcf_write(args.gvcf, args.out_fh, args.aux.hdr, NULL, 0);
if ( args.flag & CF_INS_MISSED ) bcf_sr_regions_flush(args.aux.srs->targets);
if ( args.flag & CF_INS_MISSED ) tgt_flush(&args,NULL);
destroy_data(&args);
return 0;
}
Expand Down

0 comments on commit 8457b56

Please sign in to comment.