diff --git a/bcftools.h b/bcftools.h index 7d3936ad7..6f22272b9 100644 --- a/bcftools.h +++ b/bcftools.h @@ -27,6 +27,7 @@ THE SOFTWARE. */ #include #include +#include #define FT_GZ 1 #define FT_VCF 2 @@ -60,4 +61,11 @@ static inline char gt2iupac(char a, char b) return iupac[(int)a][(int)b]; } +static inline double phred_score(double prob) +{ + if ( prob==0 ) return 99; + prob = -4.3429*log(prob); + return prob>99 ? 99 : prob; +} + #endif diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 6c2cde83e..1dca046f8 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1332,20 +1332,22 @@ Emission probabilities: Transition probabilities: tAZ = P(AZ|HW) .. from HW to AZ, the -a parameter tHW = P(HW|AZ) .. from AZ to HW, the -H parameter - P(AZ|AZ) = 1 - P(HW|AZ) = 1 - tHW - P(HW|HW) = 1 - P(AZ|HW) = 1 - tAZ ci = P_i(C) .. probability of cross-over at site i, from genetic map AZi = P_i(AZ) .. probability of site i being AZ/non-AZ, scaled so that AZi+HWi = 1 HWi = P_i(HW) - P_{i+1}(AZ) = oAZ * max[(1-tHW) * (1-ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})] - P_{i+1}(HW) = oHW * max[(1-tAZ) * (1-ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}] + P_{i+1}(AZ) = oAZ * max[(1 - tAZ * ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})] + P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}] -------------------------------------- ==== General Options: +*--AF-dflt* 'FLOAT':: + in case allele frequency is not known, use the 'FLOAT'. By default, sites where + allele frequency cannot be determined, or is 0, are skipped. + *--AF-tag* 'TAG':: use the specified INFO tag 'TAG' as an allele frequency estimate instead of the defaul AC and AN tags. Sites which do not have 'TAG' diff --git a/vcfcnv.c b/vcfcnv.c index d3d777b26..08b980c6b 100644 --- a/vcfcnv.c +++ b/vcfcnv.c @@ -518,13 +518,6 @@ static inline char copy_number_state(args_t *args, int istate, int ismpl) return code[idx]; } -static double phred_score(double prob) -{ - if ( prob==0 ) return 99; - prob = -4.3429*log(prob); - return prob>99 ? 99 : prob; -} - static double avg_ii_prob(int n, double *mat) { int i; diff --git a/vcfroh.c b/vcfroh.c index 69a1727c8..fa64b798e 100644 --- a/vcfroh.c +++ b/vcfroh.c @@ -49,7 +49,7 @@ typedef struct _args_t bcf_srs_t *files; bcf_hdr_t *hdr; double t2AZ, t2HW; // P(AZ|HW) and P(HW|AZ) parameters - double unseen_PL; + double unseen_PL, dflt_AF; char *genmap_fname; genmap_t *genmap; @@ -122,20 +122,21 @@ static void init_data(args_t *args) } free(smpls); } - else + else if ( !args->estimate_AF ) kputs(args->sample, &str); - int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); - if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); - else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret); + if ( str.l ) + { + int ret = bcf_hdr_set_samples(args->hdr, str.s, 0); + if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s); + else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret); + } if ( args->af_tag ) if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) ) error("No such INFO tag in the VCF: %s\n", args->af_tag); - if ( !bcf_sr_set_samples(args->files, str.s, 0) ) - error("Error: could not set the samples %s\n", str.s); - args->nsmpl = args->files->n_smpl; + args->nsmpl = bcf_hdr_nsamples(args->hdr); args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample); free(str.s); @@ -324,12 +325,16 @@ static void flush_viterbi(args_t *args) { // single viterbi pass, one chromsome hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites); + hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites); + double *fwd = hmm_get_fwd_bwd_prob(args->hmm); const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid); uint8_t *vpath = hmm_get_viterbi_path(args->hmm); for (i=0; insites; i++) { - printf("%s\t%d\t%d\t..\n", chr,args->sites[i]+1,vpath[i*2]==STATE_AZ ? 1 : 0); + int state = vpath[i*2]==STATE_AZ ? 1 : 0; + double *pval = fwd + i*2; + printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state])); } return; } @@ -363,13 +368,12 @@ static void flush_viterbi(args_t *args) } } - // update the transition matrix tprob for (i=0; i<2; i++) { int n = 0; for (j=0; j<2; j++) n += MAT(tcounts,2,i,j); - error("fixme: state %d not observed\n", i+1); + if ( !n) error("fixme: state %d not observed\n", i+1); for (j=0; j<2; j++) MAT(tcounts,2,i,j) /= n; } if ( args->genmap_fname || args->rec_rate > 0 ) @@ -515,7 +519,11 @@ int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg) } if ( ret<0 ) return ret; - + if ( *alt_freq==0.0 ) + { + if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0 + *alt_freq = args->dflt_AF; + } // Set P(D|G) if ( args->fake_PLs ) @@ -652,6 +660,7 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools roh [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "General Options:\n"); + fprintf(stderr, " --AF-dflt if AF is not known, use this allele frequency [skip]\n"); fprintf(stderr, " --AF-tag use TAG for allele frequency\n"); fprintf(stderr, " --AF-file read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n"); fprintf(stderr, " -e, --estimate-AF calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in \n"); @@ -666,8 +675,8 @@ static void usage(args_t *args) fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); fprintf(stderr, "\n"); fprintf(stderr, "HMM Options:\n"); - fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [1e-1]\n"); - fprintf(stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [1e-1]\n"); + fprintf(stderr, " -a, --hw-to-az P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n"); + fprintf(stderr, " -H, --az-to-hw P(HW|AZ) transition probability from AZ to HW state [5e-9]\n"); fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n"); fprintf(stderr, "\n"); exit(1); @@ -679,8 +688,8 @@ int main_vcfroh(int argc, char *argv[]) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); - args->t2AZ = 1e-1; - args->t2HW = 1e-1; + args->t2AZ = 6.7e-8; + args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; @@ -688,6 +697,7 @@ int main_vcfroh(int argc, char *argv[]) { {"AF-tag",1,0,0}, {"AF-file",1,0,1}, + {"AF-dflt",1,0,2}, {"estimate-AF",1,0,'e'}, {"GTs-only",1,0,'G'}, {"sample",1,0,'s'}, @@ -710,6 +720,10 @@ int main_vcfroh(int argc, char *argv[]) switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; + case 2: + args->dflt_AF = strtod(optarg,&tmp); + if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); + break; case 'e': args->estimate_AF = optarg; naf_opts++; break; case 'I': args->snps_only = 1; break; case 'G':