diff --git a/bwamem.c b/bwamem.c index 84c84241..42788c49 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; o->pen_unpaired = 9; o->pen_clip = 5; o->min_seed_len = 19; @@ -492,7 +492,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, k; + int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -549,16 +549,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); + a->w = aw[0] = aw[1] = opt->w; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore; + int qle, tle, gtle, gscore, tmps = -1; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore); + for (aw[0] = opt->w;; aw[0] <<= 1) { + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; + tmps = a->score; + } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end @@ -566,16 +572,21 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore; + int qle, tle, qe, re, gtle, gscore, tmps = -1; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); + for (aw[1] = opt->w;; aw[1] <<= 1) { + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], a->score, &qle, &tle, >le, &gscore, &max_off[1]); + if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; + tmps = a->score; + } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) { printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } + if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { @@ -583,6 +594,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } + a->w = aw[0] > aw[1]? aw[0] : aw[1]; } free(srt); free(rseq); } @@ -750,7 +762,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); if (k == 0) mapq0 = h.qual; else if (h.qual > mapq0) h.qual = mapq0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); s->sam = str.s; diff --git a/bwamem.h b/bwamem.h index 3f4ce322..4d632e74 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,6 +22,7 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width + int max_w; // max band width int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length @@ -45,6 +46,7 @@ typedef struct { int sub; // 2nd best SW score int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits + int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; diff --git a/ksw.c b/ksw.c index b97fed5d..3747cb04 100644 --- a/ksw.c +++ b/ksw.c @@ -359,11 +359,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -386,6 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, w = w < max_gap? w : max_gap; // DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -410,7 +411,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h = h > e? h : e; h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column - mj = m > h? mj : j; + mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] h -= gapoe; h = h > 0? h : 0; @@ -426,7 +427,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, gscore = gscore > h1? gscore : h1; } if (m == 0) break; - if (m > max) max = m, max_i = i, max_j = mj; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } // update beg and end for the next round for (j = mj; j >= beg && eh[j].h; --j); beg = j + 1; @@ -439,6 +443,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; return max; } diff --git a/ksw.h b/ksw.h index d2975dec..6d1f7cfa 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus }