From c4172c5eca9e03113b028676b993c7279e5b1477 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 1 Oct 2013 13:45:18 +0100 Subject: [PATCH] call: Trio constrained calling (no sex chromosomes yet) --- Makefile | 4 +- call.h | 42 +++- mcall.c | 612 +++++++++++++++++++++++++++++++++++---------------- test/test.pl | 7 + vcfcall.c | 273 ++++++++++++++++------- 5 files changed, 659 insertions(+), 279 deletions(-) diff --git a/Makefile b/Makefile index 31703311b..cbc6adae9 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,8 @@ DFLAGS= OBJS= main.o vcfview.o bcfidx.o tabix.o \ vcfstats.o vcfisec.o vcfmerge.o vcfquery.o vcffilter.o filter.o vcfsom.o \ vcfnorm.o vcfgtcheck.o vcfsubset.o \ - vcfcall.o mcall.o + vcfcall.o mcall.o \ + ccall.o em.o prob1.o kmin.o # the original samtools calling INCLUDES= -I. -I$(HTSDIR) SUBDIRS= . @@ -60,6 +61,7 @@ test: main.o: version.h $(HTSDIR)/version.h bcftools.h vcfcall.o: vcfcall.c call.h mcall.c prob1.h $(HTSDIR)/htslib/kfunc.h $(HTSDIR)/htslib/vcf.h +mcall.o ccall.o: call.h vcffilter.o: filter.h vcfsubset.o: filter.h diff --git a/call.h b/call.h index 7a900c6ab..2ce86cfac 100644 --- a/call.h +++ b/call.h @@ -3,37 +3,59 @@ #include -#define CALL_KEEPALT 1 -#define CALL_VARONLY (1<<1) +#define CALL_KEEPALT 1 +#define CALL_VARONLY (1<<1) +#define CALL_CONSTR_ALLELES (1<<2) +#define CALL_CONSTR_TRIO (1<<3) +#define FATHER 0 +#define MOTHER 1 +#define CHILD 2 +typedef struct +{ + char *name; + int sample[3]; +} +family_t; + +typedef struct _ccall_t ccall_t; typedef struct { // mcall only double min_ma_lrt; // variant accepted if P(chi^2)>=FLOAT [0.99] - int *PLs, nPLs, *gts; // VCF PL likelihoods (rw); GTs (w) - float anno16[16]; // see anno[16] in bam2bcf.h - double *pl2p, *pdg; // PL to 10^(-PL/10) table; PLs converted to P(D|G) float *qsum; // QS(sum) values int nqsum, npdg; int *als_map, nals_map; // mapping from full set of alleles to trimmed set of alleles (old -> new) int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old) char **als; // array to hold the trimmed set of alleles to appear on output int nals; // size of the als array + family_t *fams; // list of families and samples for trio calling + int nfams, mfams; + int ntrio[5]; // possible trio genotype combinations + uint16_t *trio[5]; + double *GLs, *sumGLs; + int *GQs; // VCF FORMAT genotype qualities // ccall only double indel_frac, theta, min_lrt, min_perm_p; double prior_type, pref; + double ref_lk, lk_sum; int ngrp1_samples, n_perm; + int ac[4], nhets, ndiploid; char *prior_file; + ccall_t *cdat; // shared bcf1_t *rec; bcf_hdr_t *hdr; - - uint32_t flag; // One or more of the CALL_* flags defined above + uint32_t flag; // One or more of the CALL_* flags defined above uint8_t *ploidy; - int nsamples; - uint32_t trio; + + double pl2p[256]; // PL to 10^(-PL/10) table + int *PLs, nPLs, mPLs; // VCF PL likelihoods (rw) + int *gts; // GTs (w) + double *pdg; // PLs converted to P(D|G) + float *anno16; int n16; // see anno[16] in bam2bcf.h } call_t; @@ -55,5 +77,7 @@ void mcall_destroy(call_t *call); void ccall_destroy(call_t *call); void qcall_destroy(call_t *call); +void call_init_pl2p(call_t *call); +uint32_t *call_trio_prep(int is_x, int is_son); #endif diff --git a/mcall.c b/mcall.c index c6fcd3820..019c03375 100644 --- a/mcall.c +++ b/mcall.c @@ -2,9 +2,9 @@ #include #include "call.h" -void ccall_init(call_t *call) { return; } -void ccall_destroy(call_t *call) { return; } -int ccall(call_t *call, bcf1_t *rec) { return 0; } +// void ccall_init(call_t *call) { return; } +// void ccall_destroy(call_t *call) { return; } +// int ccall(call_t *call, bcf1_t *rec) { return 0; } void qcall_init(call_t *call) { return; } void qcall_destroy(call_t *call) { return; } @@ -16,12 +16,52 @@ int qcall(call_t *call, bcf1_t *rec) return 0; } -void mcall_init(call_t *call) -{ - call->pl2p = (double*) malloc(sizeof(double)*256); +void call_init_pl2p(call_t *call) +{ int i; for (i=0; i<256; i++) call->pl2p[i] = pow(10., -i/10.); +} + +static void mcall_init_trios(call_t *call) +{ + // 23, 138, 478 possible trio genotypes with 2, 3, 4 alleles + call->ntrio[2] = 23, call->ntrio[3] = 138, call->ntrio[4] = 478; + call->trio[2] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[2]); + call->trio[3] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[3]); + call->trio[4] = (uint16_t*) malloc(sizeof(uint16_t)*call->ntrio[4]); + + // max 10 possible diploid genotypes + int gts[10], nals; + for (nals=2; nals<=4; nals++) + { + int i,j,k, n = 0, ngts = 0; + for (i=0; intrio[nals] ); + call->trio[nals][n++] = i<<8 | j<<4 | k; // father, mother, child + } + } + call->GLs = (double*) malloc(sizeof(double)*bcf_nsamples(call->hdr)*10); + call->sumGLs = (double*) malloc(sizeof(double)*bcf_nsamples(call->hdr)); + call->GQs = (int*) malloc(sizeof(int)*bcf_nsamples(call->hdr)); +} +static void mcall_destroy_trios(call_t *call) +{ + int i; + for (i=2; i<=4; i++) free(call->trio[i]); +} + +void mcall_init(call_t *call) +{ + call_init_pl2p(call); call->nqsum = 4; call->qsum = (float*) malloc(sizeof(float)*call->nqsum); @@ -29,10 +69,14 @@ void mcall_init(call_t *call) call->als_map = (int*) malloc(sizeof(int)*call->nals_map); call->npl_map = 4*(4+1)/2; call->pl_map = (int*) malloc(sizeof(int)*call->npl_map); - call->gts = (int*) calloc(call->hdr->n[BCF_DT_SAMPLE]*2,sizeof(int)); // assuming at most diploid everywhere + call->gts = (int*) calloc(bcf_nsamples(call->hdr)*2,sizeof(int)); // assuming at most diploid everywhere + + if ( call->flag & CALL_CONSTR_TRIO ) mcall_init_trios(call); bcf_hdr_append(call->hdr,"##FORMAT="); + bcf_hdr_append(call->hdr,"##FORMAT="); bcf_hdr_append(call->hdr,"##INFO="); + bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); @@ -42,7 +86,11 @@ void mcall_init(call_t *call) void mcall_destroy(call_t *call) { - free(call->pl2p); + mcall_destroy_trios(call); + free(call->GLs); + free(call->sumGLs); + free(call->GQs); + free(call->anno16); free(call->PLs); free(call->qsum); free(call->als_map); @@ -58,7 +106,10 @@ void mcall_destroy(call_t *call) // depth, missing PLs are all zero. In this case, pdg's are set to 0 // so that the corresponding genotypes can be set as missing and the // qual calculation is not affected. -void set_pdg(call_t *call, int *PLs, double *pdg, int n_smpl, int n_gt) +// NB: While the -m callig model uses the pdgs in canonical order, +// the original samtools -c calling code uses pdgs in reverse order (AA comes +// first, RR last). +void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt) { int i, j; for (i=0; ipl2p[ PLs[j] ]; + pdg[j] = pl2p[ PLs[j] ]; sum += pdg[j]; } // Normalize: sum_i pdg_i = 1 @@ -136,19 +187,28 @@ float calc_ICB(int nref, int nalt, int nhets, int ndiploid) double q = 2*fref*falt; // probability of a het, assuming HWE double mean = q*ndiploid; - fprintf(stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); + //fprintf(stderr,"\np=%e N=%d k=%d .. nref=%d nalt=%d nhets=%d ndiploid=%d\n", q,ndiploid,nhets, nref,nalt,nhets,ndiploid); // Can we use normal approximation? The second condition is for performance only // and is not well justified. if ( (mean>10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) { - fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); + //fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); } return binom_dist(ndiploid, q, nhets); } +float calc_HOB(int nref, int nalt, int nhets, int ndiploid) +{ + if ( !nref || !nalt || !ndiploid ) return HUGE_VAL; + + double fref = (double)nref/(nref+nalt); // fraction of reference allelels + double falt = (double)nalt/(nref+nalt); // non-ref als + return fabs((double)nhets/ndiploid - 2*fref*falt); +} + /** * log(sum_i exp(a_i)) */ @@ -182,40 +242,16 @@ inline double logsumexp2(double a, double b) #define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; } - -/** - * This function implements the multiallelic calling model. It has two major parts: - * 1) determine the most likely set of alleles and calculate the quality of ref/non-ref site - * 2) determine and set the genotypes - * In various places in between, the BCF record gets updated. - */ -int mcall(call_t *call, bcf1_t *rec) +// Determine the most likely combination of alleles. In this implementation, +// at most tri-allelic sites are considered. Returns the number of alleles. +static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { - int nals = rec->n_allele; - if ( nals>4 ) - error("FIXME: Not ready for more than 4 alleles at %s:%d (%d)\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1, nals); - - // Get the genotype likelihoods - int nsmpl = call->hdr->n[BCF_DT_SAMPLE]; - int npl = bcf_get_format_int(call->hdr, rec, "PL", &call->PLs, &call->nPLs); - if ( npl!=nsmpl*nals*(nals+1)/2 && npl!=nsmpl*nals ) - error("Wrong number of PL fields? nals=%d npl=%d\n", nals,npl); - - // Convert PLs to probabilities - int ngts = nals*(nals+1)/2; - hts_expand(double, npl, call->npdg, call->pdg); - set_pdg(call, call->PLs, call->pdg, call->hdr->n[BCF_DT_SAMPLE], ngts); - - // Get sum of qualities - int i, nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); - assert( nals<=call->nqsum ); - for (i=nqs; iqsum[i] = 0; - - // 1) Determine the most likely combination of alleles. In this implementation, at most three-allelic sites are considered. int ia,ib,ic; // iterators over up to three alleles int max_als=0, max_als2=0; // most likely and second-most likely combination of alleles double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN; // likelihood of the reference and of most likely combination of alleles double lk_sum = -HUGE_VAL; // for normalizing the likelihoods + int nsmpl = bcf_nsamples(call->hdr); + int ngts = nals*(nals+1)/2; // Single allele for (ia=0; iaflag & CALL_KEEPALT ) - { - n1 = 0; - for (i=0; id.allele[i][0]!='X' ) { max_als |= 1<ref_lk = ref_lk; + call->lk_sum = lk_sum; + *out_als = max_als; + return n1; +} + +static void mcall_set_ref_genotypes(call_t *call, int nals) +{ + int i; + int ngts = nals*(nals+1)/2; + int nsmpl = bcf_nsamples(call->hdr); + + for (i=0; i<4; i++) call->ac[i] = 0; + call->nhets = 0; + call->ndiploid = 0; + + // Set all genotypes to 0/0 and remove PL vector + int *gts = call->gts; + double *pdg = call->pdg; + int isample; + for (isample = 0; isample < nsmpl; isample++) { - max_als |= 1; - n1++; + int ploidy = call->ploidy ? call->ploidy[isample] : 2; + for (i=0; iac[0] += ploidy; + } + gts += 2; + pdg += ngts; } +} +static void mcall_call_genotypes(call_t *call, int nals, int nout_als, int out_als) +{ + int ia, ib, i; + int ngts = nals*(nals+1)/2; + int nsmpl = bcf_nsamples(call->hdr); - // 2) Set the genotypes - int ac[4] = {0,0,0,0}; - if ( max_als==1 ) + for (i=0; i<4; i++) call->ac[i] = 0; + call->nhets = 0; + call->ndiploid = 0; + + double *pdg = call->pdg - ngts; + int *gts = call->gts - 2; + + int isample; + for (isample = 0; isample < nsmpl; isample++) { - // Reference only calls: return or set all genotypes to 0/0 - if ( call->flag & CALL_VARONLY ) return 0; - init_allele_trimming_maps(call, max_als, nals); + int ploidy = call->ploidy ? call->ploidy[isample] : 2; - // Set the quality - rec->qual = lk_sum==-HUGE_VAL ? 0 : -4.343*log(1 - exp(ref_lk - lk_sum)); + pdg += ngts; + gts += 2; - // Set all genotypes to 0/0 and remove PL vector - int *gts = call->gts; - double *pdg = call->pdg; - int isample; - for (isample = 0; isample < nsmpl; isample++) + // Skip samples with all pdg's equal to 1. These have zero depth. + for (i=0; iploidy ? call->ploidy[isample] : 2; - for (i=0; indiploid++; + + // Non-zero depth, determine the most likely genotype + double best_lk = 0; + for (ia=0; iaqsum[ia]*call->qsum[ia]; + if ( best_lk < lk ) + { + best_lk = lk; + gts[0] = bcf_gt_unphased(call->als_map[ia]); + } } - else + if ( ploidy==2 ) { - gts[0] = bcf_gt_unphased(0); - gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; - ac[0] += ploidy; + gts[1] = gts[0]; + for (ia=0; iaqsum[ia]*call->qsum[ib]; + if ( best_lk < lk ) + { + best_lk = lk; + gts[0] = bcf_gt_unphased(call->als_map[ia]); + gts[1] = bcf_gt_unphased(call->als_map[ib]); + } + } + } + if ( gts[0] != gts[1] ) call->nhets++; } - gts += 2; - pdg += ngts; + else + gts[1] = bcf_int32_vector_end; } - bcf1_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now + + call->ac[ bcf_gt_allele(gts[0]) ]++; + if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++; } - else +} + + +static void mcall_call_trio_genotypes(call_t *call, int nals, int nout_als, int out_als) +{ + int ia, ib, i; + int nsmpl = bcf_nsamples(call->hdr); + int ngts = nals*(nals+1)/2; + double *gls = call->GLs - ngts; + double *pdg = call->pdg - ngts; + + // Collect unconstrained genotype likelihoods + int isample; + for (isample = 0; isample < nsmpl; isample++) { - // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes. - // Note that it is a valid outcome if the called genotypes exclude some of the ALTs. - init_allele_trimming_maps(call, max_als, nals); + int ploidy = call->ploidy ? call->ploidy[isample] : 2; - int npls_src = ngts, npls_dst = n1*(n1+1)/2; // number of PL values in diploid samples, ori and new - double *pdg = call->pdg - ngts; - int *gts = call->gts - 2; - int *pls_src = call->PLs - npls_src, *pls_dst = call->PLs - npls_dst; + gls += ngts; + pdg += ngts; - int isample, ndiploid = 0, nhets = 0; - for (isample = 0; isample < nsmpl; isample++) + // Skip samples with all pdg's equal to 1. These have zero depth. + for (i=0; iploidy ? call->ploidy[isample] : 2; + if ( !(out_als & 1<als_map[ia]+1)*(call->als_map[ia]+2)/2-1; + double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia]; + sum_lk += lk; + gls[idx] = log(lk); + } + if ( ploidy==2 ) + { + for (ia=0; iaals_map[ia]+1)*(call->als_map[ia]+2)/2-1; + for (ib=0; ibals_map[ia] + call->als_map[ib]; + double lk = 2*pdg[iab]*call->qsum[ia]*call->qsum[ib]; + sum_lk += lk; + gls[idx] = log(lk); + } + } + } + call->sumGLs[isample] = log(sum_lk); + } - pdg += ngts; - gts += 2; - pls_src += npls_src; - pls_dst += npls_dst; + for (i=0; i<4; i++) call->ac[i] = 0; + call->nhets = 0; + call->ndiploid = 0; - // Skip samples with all pdg's equal to 1. These have zero depth. - for (i=0; intrio[nout_als]; + uint16_t *trio = call->trio[nout_als]; + + // Calculate constrained likelihoods and determine genotypes + int ifm; + for (ifm=0; ifmnfams; ifm++) + { + family_t *fam = &call->fams[ifm]; + + // Best constrained likelihood + double cbest = -HUGE_VAL; + int ibest = 0, itr; + for (itr=0; itrsample[i]; + double *gl = call->GLs + ngts*ismpl; + int igt = trio[itr]>>((2-i)*4) & 0xf; + lk += gl[igt]; } + if ( cbest < lk ) { cbest = lk; ibest = itr; } + } + + // Set the genotype quality. The current way is not correct, P(igt) != P(igt|constraint) + for (i=0; i<3; i++) + { + int igt = trio[ibest]>>((2-i)*4) & 0xf; + int ismpl = fam->sample[i]; + double *gl = call->GLs + ngts*ismpl; + if ( gl[0]==1 ) + call->GQs[ismpl] = bcf_int32_missing; else { - if ( ploidy==2 ) ndiploid++; - - // Non-zero depth, determine the most likely genotype - int ia; - double best_lk = 0; - for (ia=0; iaqsum[ia]*call->qsum[ia]; - if ( best_lk < lk ) - { - best_lk = lk; - gts[0] = bcf_gt_unphased(call->als_map[ia]); - } - } - if ( ploidy==2 ) - { - gts[1] = gts[0]; - for (ia=0; iaqsum[ia]*call->qsum[ib]; - if ( best_lk < lk ) - { - best_lk = lk; - gts[0] = bcf_gt_unphased(call->als_map[ia]); - gts[1] = bcf_gt_unphased(call->als_map[ib]); - } - } - } - if ( gts[0] != gts[1] ) nhets++; - } - else - gts[1] = bcf_int32_vector_end; + double pval = -4.343*log(1.0 - exp(gl[igt] - call->sumGLs[ismpl])); + call->GQs[ismpl] = pval > 99 ? 99 : pval; + if ( call->GQs[ismpl] > 99 ) call->GQs[ismpl] = 99; } + } - // Subset PL vector - if ( npls_src!=npls_dst ) + // Set genotypes for father, mother, child + for (i=0; i<3; i++) + { + int ismpl = fam->sample[i]; + int ploidy = call->ploidy ? call->ploidy[ismpl] : 2; + double *gl = call->GLs + ngts*ismpl; + int *gts = call->gts + 2*ismpl; + if ( gl[0]==1 ) // zero depth, set missing genotypes { - if ( ploidy==2 ) - { - for (ia=0; iapl_map[ia] ]; - } - else - { - for (ia=0; iapl_map[ia]; - isrc = (isrc+1)*(isrc+2)/2-1; - pls_dst[ia] = pls_src[isrc]; - } - pls_dst[ia] = bcf_int32_vector_end; - } + gts[0] = bcf_gt_missing; + gts[1] = ploidy==2 ? bcf_gt_missing : bcf_int32_vector_end; + continue; } - - ac[ bcf_gt_allele(gts[0]) ]++; - if ( gts[1]!=bcf_int32_vector_end ) ac[ bcf_gt_allele(gts[1]) ]++; + int igt = trio[ibest]>>((2-i)*4) & 0xf; + + // Convert genotype index idx to allele indices + int k = 0, dk = 1; + while ( kac[ib]++; + if ( ploidy==2 ) + { + call->ac[ia]++; + if ( ia!=ib ) call->nhets++; + gts[1] = bcf_gt_unphased(ia); + call->ndiploid++; + } + else + gts[1] = bcf_int32_vector_end; } + } +} - if ( npls_src!=npls_dst ) - bcf1_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl); +static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +{ + int ngts = nals*(nals+1)/2; + int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new + if ( npls_src == npls_dst ) return; - // Skip the site if all samples are 0/0. This can happen occasionally. - if ( ac[1]+ac[2]+ac[3]==0 ) + int *pls_src = call->PLs, *pls_dst = call->PLs; + + int nsmpl = bcf_nsamples(call->hdr); + int isample, ia; + for (isample = 0; isample < nsmpl; isample++) + { + int ploidy = call->ploidy ? call->ploidy[isample] : 2; + if ( ploidy==2 ) { - if ( call->flag & CALL_VARONLY ) return 0; + for (ia=0; iapl_map[ia] ]; } else { - float icb = calc_ICB(ac[0],ac[1]+ac[2]+ac[3], nhets, ndiploid); - if ( icb != HUGE_VAL ) bcf1_update_info_float(call->hdr, rec, "ICB", &icb, 1); + for (ia=0; iapl_map[ia]; + isrc = (isrc+1)*(isrc+2)/2-1; + pls_dst[ia] = pls_src[isrc]; + } + if ( iahdr, rec, "PL", call->PLs, npls_dst*nsmpl); +} - // Set the quality of a REF - if ( lk_sum==-HUGE_VAL ) - rec->qual = 0; - else - rec->qual = n1==1 ? -4.343*log(1 - exp(ref_lk - lk_sum)) : -4.343*(ref_lk - lk_sum); + +/** + * This function implements the multiallelic calling model. It has two major parts: + * 1) determine the most likely set of alleles and calculate the quality of ref/non-ref site + * 2) determine and set the genotypes + * In various places in between, the BCF record gets updated. + */ +int mcall(call_t *call, bcf1_t *rec) +{ +//if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec); + + int nals = rec->n_allele; + if ( nals>4 ) + error("FIXME: Not ready for more than 4 alleles at %s:%d (%d)\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1, nals); + + // Get the genotype likelihoods + int nsmpl = bcf_nsamples(call->hdr); + call->nPLs = bcf_get_format_int(call->hdr, rec, "PL", &call->PLs, &call->mPLs); + if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only + error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs); + + // Convert PLs to probabilities + int ngts = nals*(nals+1)/2; + hts_expand(double, call->nPLs, call->npdg, call->pdg); + set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts); + + // Get sum of qualities + int i, nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum); + assert( nals<=call->nqsum ); + for (i=nqs; iqsum[i] = 0; + + // Find the best combination of alleles + int out_als, nout = mcall_find_best_alleles(call, nals, &out_als); + + // With -A, keep all ALTs except X + if ( call->flag & CALL_KEEPALT ) + { + nout = 0; + for (i=0; id.allele[i][0]!='X' ) { out_als |= 1<flag & CALL_VARONLY && out_als==1 ) return 0; + int nAC = 0; + if ( out_als==1 ) + { + init_allele_trimming_maps(call, 1, 1); + mcall_set_ref_genotypes(call,nals); + bcf1_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now + } + else + { + // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes. + // Note that it is a valid outcome if the called genotypes exclude some of the ALTs. + init_allele_trimming_maps(call, out_als, nals); + if ( call->flag & CALL_CONSTR_TRIO ) + { + mcall_call_trio_genotypes(call,nals,nout,out_als); + bcf1_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); + } + else + mcall_call_genotypes(call,nals,nout,out_als); + + // Skip the site if all samples are 0/0. This can happen occasionally. + nAC = call->ac[1] + call->ac[2] + call->ac[3]; + if ( !nAC && call->flag & CALL_VARONLY ) return 0; + mcall_trim_PLs(call, rec, nals, nout, out_als); + } + + // Set QUAL and calculate HWE-related annotations + if ( nAC ) + { + float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid); + if ( icb != HUGE_VAL ) bcf1_update_info_float(call->hdr, rec, "ICB", &icb, 1); + + float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid); + if ( hob != HUGE_VAL ) bcf1_update_info_float(call->hdr, rec, "HOB", &hob, 1); + + // Quality of a variant site + rec->qual = call->lk_sum==-HUGE_VAL ? 0 : -4.343*(call->ref_lk - call->lk_sum); } + else + { + // Set the quality of a REF site + rec->qual = call->lk_sum==-HUGE_VAL ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum)); + } + if ( rec->qual>999 ) rec->qual = 999; + if ( rec->qual>50 ) rec->qual = rint(rec->qual); + + // AC, AN + if ( nout>1 ) bcf1_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1); + nAC += call->ac[0]; + bcf1_update_info_int32(call->hdr, rec, "AN", &nAC, 1); - hts_expand(char*,n1,call->nals,call->als); + // Remove unused alleles + hts_expand(char*,nout,call->nals,call->als); for (i=0; ials_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i]; - bcf1_update_alleles(call->hdr, rec, (const char**)call->als, n1); + bcf1_update_alleles(call->hdr, rec, (const char**)call->als, nout); bcf1_update_genotypes(call->hdr, rec, call->gts, nsmpl*2); // DP4 tag - int ndp = 16; float *anno16 = call->anno16; - bcf_get_info_float(call->hdr, rec, "I16", &anno16, &ndp); - assert( anno16==call->anno16 ); // this shouldn't happen unless the VCF is broken + if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)!=16 ) + error("I16 hasn't 16 fields at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1); int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3]; bcf1_update_info_int32(call->hdr, rec, "DP4", dp, 4); bcf1_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag bcf1_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - // AC, AN - if ( n1>1 ) bcf1_update_info_int32(call->hdr, rec, "AC", ac+1, n1-1); - ac[0] += ac[1] + ac[2] + ac[3]; - bcf1_update_info_int32(call->hdr, rec, "AN", ac, 1); - - if ( rec->qual>999 ) rec->qual = 999; - if ( rec->qual>50 ) rec->qual = rint(rec->qual); - - return n1; + return nout; } diff --git a/test/test.pl b/test/test.pl index 4e3ae8b52..bf8b8efaf 100755 --- a/test/test.pl +++ b/test/test.pl @@ -25,6 +25,7 @@ test_vcf_subset($opts,in=>'subset',out=>'subset.2.out',args=>'-f PASS -k',reg=>'-r20,Y'); test_vcf_subset($opts,in=>'subset',out=>'subset.3.out',args=>'-ps NA00003',reg=>''); test_vcf_subset($opts,in=>'subset',out=>'subset.4.out',args=>q[-i '%QUAL==999 && (FS<20 || FS>=41.02) && HWE*2>1.2'],reg=>''); +test_vcf_call($opts,in=>'mpileup',out=>'mpileup.1.out',args=>'-mv'); print "\nNumber of tests:\n"; printf " total .. %d\n", $$opts{nok}+$$opts{nfailed}; @@ -264,4 +265,10 @@ sub test_vcf_subset bgzip_tabix_vcf($opts,$args{in}); test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools subset $args{args} $$opts{tmp}/$args{in}.vcf.gz $args{reg} | grep -v ^##bcftools_subset"); } +sub test_vcf_call +{ + my ($opts,%args) = @_; + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools call $args{args} $$opts{path}/$args{in}.vcf | grep -v ^##bcftools_call"); +} + diff --git a/vcfcall.c b/vcfcall.c index d81e3deda..35a00170f 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "bcftools.h" #include "call.h" #include "prob1.h" @@ -67,6 +68,97 @@ typedef struct args_t; +static family_t *get_family(family_t *fam, int nfam, char *name) +{ + int i; + for (i=0; i= 0) + { + char *col_ends[5], *tmp = s.s; + i = 0; + while ( *tmp && i<5 ) + { + if ( *tmp=='\t' || *tmp==' ' ) + { + col_ends[i] = tmp; + *tmp = 0; + i++; + } + tmp++; + } + if ( i!=5 ) break; + + family_t *fam = get_family(call->fams, call->nfams, s.s); + if ( !fam ) + { + call->nfams++; + hts_expand(family_t, call->nfams, call->mfams, call->fams); + fam = &call->fams[call->nfams-1]; + fam->name = strdup(s.s); + for (i=0; i<3; i++) fam->sample[i] = -1; + } + sam = add_sample(sam, &n, &max, col_ends[0]+1, &i); + if ( col_ends[2]-col_ends[1]!=2 || col_ends[1][1]!='0' ) // father + { + if ( fam->sample[CHILD]>=0 ) error("Multiple childs in %s\n", s.s); + fam->sample[CHILD] = i; + if ( fam->sample[FATHER]>=0 ) error("Two fathers in %s?\n", s.s); + sam = add_sample(sam, &n, &max, col_ends[1]+1, &fam->sample[FATHER]); + } + if ( col_ends[3]-col_ends[2]!=2 || col_ends[2][1]!='0' ) // mother + { + if ( fam->sample[MOTHER]>=0 ) error("Two mothers in %s?\n", s.s); + sam = add_sample(sam, &n, &max, col_ends[2]+1, &fam->sample[MOTHER]); + } + } + for (i=0; infams; i++) + assert( call->fams[i].sample[0]>=0 && call->fams[i].sample[1]>=0 && call->fams[i].sample[2]>=0 ); //todo + + ks_destroy(ks); + gzclose(fp); + free(s.s); + *_n = n; + return sam; +} + + /* * Reads sample names and their ploidy (optional) from a file. * Alternatively, if no such file exists, the file name is interpreted @@ -76,18 +168,16 @@ args_t; * Returns an array of sample names, where the byte value just after \0 * indicates the ploidy. */ -static char **read_samples(const char *fn, int *_n) +static char **read_samples(call_t *call, const char *fn, int *_n) { - gzFile fp; - kstream_t *ks; - kstring_t s = {0,0,0}; int dret, n = 0, max = 0; char **sam = 0; *_n = 0; - fp = gzopen(fn, "r"); - if (fp == 0) + + struct stat sbuf; + if ( stat(fn, &sbuf) != 0 ) { - // interpret as sample names, not as a file name + // it is not a file, interpret as list of sample names const char *t = fn, *p = t; while (*t) { @@ -106,19 +196,25 @@ static char **read_samples(const char *fn, int *_n) *_n = n; return sam; } + + sam = read_ped_samples(call, fn, _n); + if ( sam ) return sam; + + kstream_t *ks; + kstring_t s = {0,0,0}; + gzFile fp; + fp = gzopen(fn, "r"); + if ( !fp ) error("Could not read the file: %s\n", fn); ks = ks_init(fp); - while (ks_getuntil(ks, 0, &s, &dret) >= 0) { - int l; - if (max == n) { - max = max? max<<1 : 4; - sam = (char**) realloc(sam, sizeof(char*)*max); - } - l = s.l; + while (ks_getuntil(ks, KS_SEP_SPACE, &s, &dret) >= 0) + { + hts_expand(char*,(n+1),max,sam); + int l = s.l; sam[n] = (char*) malloc(sizeof(char)*(s.l+2)); strcpy(sam[n], s.s); sam[n][l+1] = 2; // by default, diploid if (dret != '\n') { - if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 + if (ks_getuntil(ks, KS_SEP_SPACE, &s, &dret) >= 0) { // read ploidy, 1 or 2 int x = (int)s.s[0] - '0'; // Convert ASCII digit to decimal if (x == 1 || x == 2) sam[n][l+1] = x; else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); @@ -141,13 +237,12 @@ static void init_data(args_t *args) // Open files for input and output, initialize structures if ( args->targets ) { - if ( bcf_sr_set_targets(args->files, args->targets,0)<0 ) + if ( bcf_sr_set_targets(args->files, args->targets, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) error("Failed to read the targets: %s\n", args->targets); } if ( args->regions ) { - args->files->require_index = 1; - if ( bcf_sr_set_targets(args->files, args->regions,0)<0 ) + if ( bcf_sr_set_regions(args->files, args->regions)<0 ) error("Failed to read the targets: %s\n", args->regions); } @@ -168,11 +263,21 @@ static void init_data(args_t *args) else args->aux.hdr = bcf_hdr_dup(args->files->readers[0].header); - // Reorder ploidy to match mpileup's output and exclude samples which are not available + // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available if ( args->aux.ploidy ) { + for (i=0; iaux.nfams; i++) + { + int j; + for (j=0; j<3; j++) + { + int k = bcf_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]); + if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]); + args->aux.fams[i].sample[j] = k; + } + } uint8_t *ploidy = (uint8_t*) calloc(args->aux.hdr->n[BCF_DT_SAMPLE], 1); - for (i=0; iaux.nsamples; i++) // i index in -s sample list + for (i=0; insamples; i++) // i index in -s sample list { int j = bcf_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]); // j index in the output VCF / subset VCF if ( j<0 ) @@ -182,8 +287,8 @@ static void init_data(args_t *args) } ploidy[j] = args->aux.ploidy[i]; } - args->aux.nsamples = args->aux.hdr->n[BCF_DT_SAMPLE]; - for (i=0; iaux.nsamples; i++) + args->nsamples = args->aux.hdr->n[BCF_DT_SAMPLE]; + for (i=0; insamples; i++) assert( ploidy[i]==1 || ploidy[i]==2 ); free(args->aux.ploidy); args->aux.ploidy = ploidy; @@ -212,6 +317,11 @@ static void destroy_data(args_t *args) else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); int i; for (i=0; insamples; i++) free(args->samples[i]); + if ( args->aux.fams ) + { + for (i=0; iaux.nfams; i++) free(args->aux.fams[i].name); + free(args->aux.fams); + } free(args->samples); free(args->samples_map); free(args->aux.ploidy); @@ -223,35 +333,34 @@ static void destroy_data(args_t *args) static void usage(args_t *args) { fprintf(stderr, "\n"); + fprintf(stderr, "About: This command replaces the former \"bcftools view\" caller. Some of the original functionality has been\n"); + fprintf(stderr, " temporarily lost in the process of transition under htslib, but will be added back on popular demand. The original\n"); + fprintf(stderr, " calling model can be invoked with the -c option. Note that we use the new multiallelic -m caller by default,\n"); + fprintf(stderr, " therefore -c is not as well tested as -m. If you encounter bugs, please do let us know.\n"); fprintf(stderr, "Usage: bcftools call [options] [reg]\n"); fprintf(stderr, "File format options:\n"); - fprintf(stderr, " -o TYPE output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -Q output the QCALL likelihood format\n"); - fprintf(stderr, " -r REG restrict to comma-separated list of regions or regions listed in tab-delimited indexed file\n"); - fprintf(stderr, " -s STR comma-separated list or file name with a list of samples. Second column indicates ploidy (1 or 2)[all samples]\n"); - fprintf(stderr, " -l REG same as -r but streams rather than index-jumps to it. Coordinates are 1-based, inclusive\n"); - fprintf(stderr, " -V input is VCF\n"); + fprintf(stderr, " -o, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " -r, --region restrict to comma-separated list of regions or regions listed in tab-delimited indexed file\n"); + fprintf(stderr, " -s, --samples comma-separated list or file name with a list of samples. Second column indicates ploidy (1 or 2)[all samples]\n"); + fprintf(stderr, " -t, --targets same as -r but streams rather than index-jumps to it. Coordinates are 1-based, inclusive\n"); + fprintf(stderr, " -V, --vcf-input stdin input is VCF\n"); fprintf(stderr, "\nInput/output options:\n"); - fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -G output sites only, drop genotype fields\n"); - fprintf(stderr, " -L calculate LD for adjacent sites\n"); - fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); - fprintf(stderr, " -v output potential variant sites only\n"); + fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -N, --skip-Ns skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -S, --skip skip indels/snps\n"); + fprintf(stderr, " -v, --variants-only output variant sites only\n"); fprintf(stderr, "\nConsensus/variant calling options:\n"); - fprintf(stderr, " -c the original calling method (conflicts with -m)\n"); - fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); - fprintf(stderr, " -i FLOAT indel-to-substitution ratio (-c only) [%.4g]\n", args->aux.indel_frac); - fprintf(stderr, " -S STR skip the given variant type \n"); - fprintf(stderr, " -m alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(stderr, " -p FLOAT variant if P(ref|D)=1-FLOAT with -m [1e-2]\n"); - fprintf(stderr, " -P STR type of prior: full, cond2, flat (-c only) [full]\n"); - fprintf(stderr, " -t FLOAT scaled substitution mutation rate (-c only) [%.4g]\n", args->aux.theta); - fprintf(stderr, " -T STR constrained calling; STR can be: pair, trioauto, trioxd and trioxs (see manual) [null]\n"); - fprintf(stderr, "\nContrast calling and association test options:\n"); - fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); - fprintf(stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); - fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); - fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); + fprintf(stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); + fprintf(stderr, " -C, --constrain one of: alleles, pair, trio, triox (see manual)\n"); + fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(stderr, " -p, --pval-threshold variant if P(ref|D)=1-FLOAT with -m [1e-2]\n"); + + // todo (and more) + // fprintf(stderr, "\nContrast calling and association test options:\n"); + // fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); + // fprintf(stderr, " -C FLOAT posterior constrast for LRTaux.min_lrt); + // fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); + // fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)aux.min_perm_p); fprintf(stderr, "\n"); exit(-1); } @@ -272,19 +381,30 @@ int main_vcfcall(int argc, char *argv[]) float p_arg = -1; int i, c; - // Note: Some of the functionality was lost in the transition but will be put back on demand (pd3 todo) - - while ((c = getopt(argc, argv, "N1:l:cC:AGvVP:t:p:QLi:S:s:U:X:d:T:mr:l:o:")) >= 0) + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"output-type",1,0,'o'}, + {"region",1,0,'r'}, + {"samples",1,0,'s'}, + {"targets",1,0,'t'}, + {"vcf-input",0,0,'V'}, + {"keep-alts",0,0,'A'}, + {"skip-Ns",0,0,'N'}, + {"skip",1,0,'S'}, + {"variants-only",0,0,'v'}, + {"consensus-caller",0,0,'c'}, + {"constrain",1,0,'C'}, + {"multiallelic-caller",0,0,'m'}, + {"pval-threshold",1,0,'p'}, + {0,0,0,0} + }; + + while ((c = getopt_long(argc, argv, "h?o:r:s:t:VANS:vcmp:C:", loptions, NULL)) >= 0) { switch (c) { case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N - case '1': args.aux.ngrp1_samples = atoi(optarg); break; // is anyone using this? Please expand docs - // case 'l': args.bed = bed_read(optarg); // todo: with few sites, use index to jump rather than stream; genotypes given alleles - // if (!args.bed) - // error("Could not read \"%s\"\n", optarg); - // break; - case 'G': args.flag |= CF_NO_GENO; break; // output only sites, no genotype fields case 'A': args.aux.flag |= CALL_KEEPALT; break; case 'V': args.flag |= CF_VCFIN; break; case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method @@ -298,46 +418,24 @@ int main_vcfcall(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); } break; + case 'C': + if ( !strcasecmp(optarg,"alleles") ) args.aux.flag |= CALL_CONSTR_ALLELES; + else if ( !strcasecmp(optarg,"trio") ) args.aux.flag |= CALL_CONSTR_TRIO; + else error("Unknown argument to -C: \"%s\"\n", optarg); + break; case 'S': if ( !strcasecmp(optarg,"snps") ) args.flag |= CF_INDEL_ONLY; else if ( !strcasecmp(optarg,"indels") ) args.flag |= CF_NO_INDEL; else error("Unknown argument to -I: \"%s\"\n", optarg); case 'm': args.flag |= CF_MCALL; break; // multiallelic calling method - case 't': args.aux.theta = atof(optarg); break; case 'p': p_arg = atof(optarg); break; - case 'i': args.aux.indel_frac = atof(optarg); break; - case 'Q': args.flag |= CF_QCALL; break; - case 'L': args.flag |= CF_ADJLD; break; - case 'U': args.aux.n_perm = atoi(optarg); break; - case 'C': args.aux.min_lrt = atof(optarg); break; - case 'X': args.aux.min_perm_p = atof(optarg); break; - // case 'd': args.aux.min_smpl_frac = atof(optarg); break; // todo case 'r': args.regions = optarg; break; - case 'l': args.targets = optarg; break; + case 't': args.targets = optarg; break; case 's': - args.samples = read_samples(optarg, &args.nsamples); - args.aux.nsamples = args.nsamples; + args.samples = read_samples(&args.aux, optarg, &args.nsamples); args.aux.ploidy = (uint8_t*) calloc(args.nsamples+1, 1); for (i=0; i not supported for now - // case 'Y': args.flag |= CF_QCNT; break; // undocumented -> not supported for now - // case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break; // undocumented -> not supported for now default: usage(&args); } } @@ -349,6 +447,11 @@ int main_vcfcall(int argc, char *argv[]) if ( !(args.flag & CF_CCALL) && !(args.flag & CF_MCALL) && !(args.flag & CF_QCALL) ) error("Expected one of -c, -m, or -Q options\n"); if ( args.aux.n_perm && args.aux.ngrp1_samples<=0 ) error("Expected -1 with -U\n"); // not sure about this, please fix if ( p_arg!=-1 ) { args.aux.pref = p_arg; args.aux.min_ma_lrt = 1 - p_arg; } // only one is actually used + if ( args.aux.flag & CALL_CONSTR_ALLELES ) + { + if ( !args.targets ) error("Expected -t with \"-C alleles\"\n"); + if ( !(args.flag & CF_MCALL) ) error("The \"-C alleles\" mode requires -m\n"); + } init_data(&args);