Skip to content

Commit

Permalink
call: new -F, --prior-freqs option
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 authored and mcshane committed May 3, 2016
1 parent 0c88e1a commit 0a5e8f8
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 2 deletions.
1 change: 1 addition & 0 deletions call.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ typedef struct
double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes()
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)

// ccall only
double indel_frac, min_perm_p, min_lrt;
Expand Down
20 changes: 20 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,26 @@ demand. The original calling model can be invoked with the *-c* option.
GQ and GP fields are supported. For convenience, the fields can be given
as lower case letters.

*-F, --prior-freqs* 'AN','AC'::
take advantage of prior knowledge of population allele frequencies. The
workflow looks like this:
----
# Extract AN,AC values from an existing VCF, such 1000Genomes
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t%AN\t%AC\n' 1000Genomes.bcf | bgzip -c > AFs.tab.gz

# If the tags AN,AC are not already present, use the +fill-AN-AC plugin
bcftools +fill-AN-AC 1000Genomes.bcf | bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t%AN\t%AC\n' | bgzip -c > AFs.tab.gz
tabix -s1 -b2 -e2 AFs.tab.gz

# Create a VCF header description, here we name the tags REF_AN,REF_AC
cat AFs.hdr
##INFO=<ID=REF_AN,Number=1,Type=Integer,Description="Total number of alleles in reference genotypes">
##INFO=<ID=REF_AC,Number=A,Type=Integer,Description="Allele count in reference genotypes for each ALT allele">

# Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies
bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
----

*-g, --gvcf* 'INT'::
output also gVCF blocks of homozygous REF calls. The parameter 'INT' is the
minimum per-sample depth required to include a site in the non-variant
Expand Down
33 changes: 32 additions & 1 deletion mcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ int calc_Pkij(int fals, int mals, int kals, int fpl, int mpl, int kpl)
//
static void mcall_init_trios(call_t *call)
{
if ( call->prior_AN )
{
int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
}

// 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
Expand Down Expand Up @@ -1393,7 +1403,7 @@ int mcall(call_t *call, bcf1_t *rec)
#if QS_FROM_PDG
estimate_qsum(call, rec);
#else
// Get sum of qualities
// Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
if ( nqs < nals )
Expand All @@ -1404,6 +1414,27 @@ int mcall(call_t *call, bcf1_t *rec)
hts_expand(float,nals,call->nqsum,call->qsum);
for (i=nqs; i<nals; i++) call->qsum[i] = 0;
}

// If available, take into account reference panel AFs
if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
int an = call->ac[0];
if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
{
int ac0 = an; // number of alleles in the reference population
for (i=0; i<nals-1; i++)
{
if ( call->ac[i]==bcf_int32_vector_end ) break;
if ( call->ac[i]==bcf_int32_missing ) continue;
ac0 -= call->ac[i];
call->qsum[i+1] += call->ac[i]*0.5;
}
if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
call->qsum[0] += ac0*0.5;
for (i=0; i<nals; i++) call->qsum[i] /= nsmpl + 0.5*an;
}
}

float qsum_tot = 0;
for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
if ( !call->qsum[0] )
Expand Down
11 changes: 10 additions & 1 deletion vcfcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,7 @@ static void usage(args_t *args)
fprintf(stderr, "Input/output options:\n");
fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
fprintf(stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
fprintf(stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
Expand Down Expand Up @@ -667,6 +668,7 @@ int main_vcfcall(int argc, char *argv[])
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
{"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
Expand Down Expand Up @@ -698,7 +700,7 @@ int main_vcfcall(int argc, char *argv[])
};

char *tmp = NULL;
while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0)
{
switch (c)
{
Expand All @@ -713,6 +715,13 @@ int main_vcfcall(int argc, char *argv[])
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
case 'i': args.flag |= CF_INS_MISSED; break;
case 'v': args.aux.flag |= CALL_VARONLY; break;
case 'F':
args.aux.prior_AN = optarg;
args.aux.prior_AC = index(optarg,',');
if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
*args.aux.prior_AC = 0;
args.aux.prior_AC++;
break;
case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
Expand Down

0 comments on commit 0a5e8f8

Please sign in to comment.