Skip to content

Commit

Permalink
New bcftools roh --AF-dflt option; changed -a,-H defaults; fixed a bu…
Browse files Browse the repository at this point in the history
…g in -e -
  • Loading branch information
pd3 authored and mcshane committed Jul 6, 2015
1 parent f506244 commit 3ba5f38
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 27 deletions.
8 changes: 8 additions & 0 deletions bcftools.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ THE SOFTWARE. */

#include <stdarg.h>
#include <htslib/vcf.h>
#include <math.h>

#define FT_GZ 1
#define FT_VCF 2
Expand Down Expand Up @@ -60,4 +61,11 @@ static inline char gt2iupac(char a, char b)
return iupac[(int)a][(int)b];
}

static inline double phred_score(double prob)
{
if ( prob==0 ) return 99;
prob = -4.3429*log(prob);
return prob>99 ? 99 : prob;
}

#endif
10 changes: 6 additions & 4 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1332,20 +1332,22 @@ Emission probabilities:
Transition probabilities:
tAZ = P(AZ|HW) .. from HW to AZ, the -a parameter
tHW = P(HW|AZ) .. from AZ to HW, the -H parameter
P(AZ|AZ) = 1 - P(HW|AZ) = 1 - tHW
P(HW|HW) = 1 - P(AZ|HW) = 1 - tAZ

ci = P_i(C) .. probability of cross-over at site i, from genetic map
AZi = P_i(AZ) .. probability of site i being AZ/non-AZ, scaled so that AZi+HWi = 1
HWi = P_i(HW)

P_{i+1}(AZ) = oAZ * max[(1-tHW) * (1-ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})]
P_{i+1}(HW) = oHW * max[(1-tAZ) * (1-ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]
P_{i+1}(AZ) = oAZ * max[(1 - tAZ * ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})]
P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]

--------------------------------------

==== General Options:

*--AF-dflt* 'FLOAT'::
in case allele frequency is not known, use the 'FLOAT'. By default, sites where
allele frequency cannot be determined, or is 0, are skipped.

*--AF-tag* 'TAG'::
use the specified INFO tag 'TAG' as an allele frequency estimate
instead of the defaul AC and AN tags. Sites which do not have 'TAG'
Expand Down
7 changes: 0 additions & 7 deletions vcfcnv.c
Original file line number Diff line number Diff line change
Expand Up @@ -518,13 +518,6 @@ static inline char copy_number_state(args_t *args, int istate, int ismpl)
return code[idx];
}

static double phred_score(double prob)
{
if ( prob==0 ) return 99;
prob = -4.3429*log(prob);
return prob>99 ? 99 : prob;
}

static double avg_ii_prob(int n, double *mat)
{
int i;
Expand Down
46 changes: 30 additions & 16 deletions vcfroh.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ typedef struct _args_t
bcf_srs_t *files;
bcf_hdr_t *hdr;
double t2AZ, t2HW; // P(AZ|HW) and P(HW|AZ) parameters
double unseen_PL;
double unseen_PL, dflt_AF;

char *genmap_fname;
genmap_t *genmap;
Expand Down Expand Up @@ -122,20 +122,21 @@ static void init_data(args_t *args)
}
free(smpls);
}
else
else if ( !args->estimate_AF )
kputs(args->sample, &str);

int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
if ( str.l )
{
int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
}

if ( args->af_tag )
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
error("No such INFO tag in the VCF: %s\n", args->af_tag);

if ( !bcf_sr_set_samples(args->files, str.s, 0) )
error("Error: could not set the samples %s\n", str.s);
args->nsmpl = args->files->n_smpl;
args->nsmpl = bcf_hdr_nsamples(args->hdr);
args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
free(str.s);

Expand Down Expand Up @@ -324,12 +325,16 @@ static void flush_viterbi(args_t *args)
{
// single viterbi pass, one chromsome
hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);

const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
for (i=0; i<args->nsites; i++)
{
printf("%s\t%d\t%d\t..\n", chr,args->sites[i]+1,vpath[i*2]==STATE_AZ ? 1 : 0);
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
double *pval = fwd + i*2;
printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
}
return;
}
Expand Down Expand Up @@ -363,13 +368,12 @@ static void flush_viterbi(args_t *args)
}
}


// update the transition matrix tprob
for (i=0; i<2; i++)
{
int n = 0;
for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
error("fixme: state %d not observed\n", i+1);
if ( !n) error("fixme: state %d not observed\n", i+1);
for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n;
}
if ( args->genmap_fname || args->rec_rate > 0 )
Expand Down Expand Up @@ -515,7 +519,11 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
}

if ( ret<0 ) return ret;

if ( *alt_freq==0.0 )
{
if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
*alt_freq = args->dflt_AF;
}

// Set P(D|G)
if ( args->fake_PLs )
Expand Down Expand Up @@ -652,6 +660,7 @@ static void usage(args_t *args)
fprintf(stderr, "Usage: bcftools roh [options] <in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "General Options:\n");
fprintf(stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
fprintf(stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
Expand All @@ -666,8 +675,8 @@ static void usage(args_t *args)
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, "\n");
fprintf(stderr, "HMM Options:\n");
fprintf(stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [1e-1]\n");
fprintf(stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [1e-1]\n");
fprintf(stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
fprintf(stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
fprintf(stderr, "\n");
exit(1);
Expand All @@ -679,15 +688,16 @@ int main_vcfroh(int argc, char *argv[])
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
args->files = bcf_sr_init();
args->t2AZ = 1e-1;
args->t2HW = 1e-1;
args->t2AZ = 6.7e-8;
args->t2HW = 5e-9;
args->rec_rate = 0;
int regions_is_file = 0, targets_is_file = 0;

static struct option loptions[] =
{
{"AF-tag",1,0,0},
{"AF-file",1,0,1},
{"AF-dflt",1,0,2},
{"estimate-AF",1,0,'e'},
{"GTs-only",1,0,'G'},
{"sample",1,0,'s'},
Expand All @@ -710,6 +720,10 @@ int main_vcfroh(int argc, char *argv[])
switch (c) {
case 0: args->af_tag = optarg; naf_opts++; break;
case 1: args->af_fname = optarg; naf_opts++; break;
case 2:
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
case 'e': args->estimate_AF = optarg; naf_opts++; break;
case 'I': args->snps_only = 1; break;
case 'G':
Expand Down

0 comments on commit 3ba5f38

Please sign in to comment.