From 70fd9a0b1fea45e442eb5f331922ea91ef4f71ae Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Mon, 6 May 2024 04:02:20 -0400 Subject: [PATCH] keeping more telomeres --- CommandLines.cpp | 46 +++++ CommandLines.h | 8 +- Overlaps.cpp | 467 ++++++++++++++++++++++++++++++++++++++++------- Overlaps.h | 8 +- gfa_ut.cpp | 152 ++++++++++----- gfa_ut.h | 6 +- 6 files changed, 570 insertions(+), 117 deletions(-) diff --git a/CommandLines.cpp b/CommandLines.cpp index 3da5ec4..41c0bf9 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -66,6 +66,11 @@ static ko_longopt_t long_options[] = { { "scaf-gap", ko_required_argument, 351}, { "sec-in", ko_required_argument, 352}, { "somatic-cov", ko_required_argument, 353}, + { "telo-m", ko_required_argument, 354}, + { "telo-p", ko_required_argument, 355}, + { "telo-d", ko_required_argument, 356}, + { "telo-s", ko_required_argument, 357}, + { "ctg-n", ko_required_argument, 358}, // { "path-round", ko_required_argument, 348}, { 0, 0, 0 } }; @@ -121,6 +126,9 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " break contigs at positions with <=FLOAT*coverage exact overlaps;\n"); fprintf(stderr, " only work with '--b-cov' or '--h-cov'[%.2f]\n", asm_opt->m_rate); fprintf(stderr, " --primary output a primary assembly and an alternate assembly\n"); + fprintf(stderr, " --ctg-n INT\n"); + fprintf(stderr, " remove tip contigs composed of <=INT reads [%d]\n", asm_opt->max_contig_tip); + // fprintf(stderr, " --pri-range INT1[,INT2]\n"); // fprintf(stderr, " keep contigs with coverage in this range in p_ctg.gfa; -1 to disable [auto,inf]\n"); @@ -188,6 +196,17 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " --scaf-gap INT\n"); fprintf(stderr, " max gap size for scaffolding [%ld]\n", asm_opt->self_scaf_gap_max); + fprintf(stderr, " Telomere-identification:\n"); + fprintf(stderr, " --telo-m STR\n"); + fprintf(stderr, " telomere motif at 5'-end; CCCTAA for human [%s]\n", ((asm_opt->telo_motif)?(asm_opt->telo_motif):("NULL")));///5'-end, check CCCTAA + fprintf(stderr, " --telo-p INT\n"); + fprintf(stderr, " non-telomeric penalty [%ld]\n", asm_opt->telo_pen); + fprintf(stderr, " --telo-d INT\n"); + fprintf(stderr, " max drop [%ld]\n", asm_opt->telo_drop); + fprintf(stderr, " --telo-s INT\n"); + fprintf(stderr, " min score for telomere reads [%ld]\n", asm_opt->telo_mic_sc); + + fprintf(stderr, "Example: ./hifiasm -o NA12878.asm -t 32 NA12878.fq.gz\n"); fprintf(stderr, "See `https://hifiasm.readthedocs.io/en/latest/' or `man ./hifiasm.1' for complete documentation.\n"); } @@ -244,6 +263,7 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->min_overlap_coverage = 0; asm_opt->max_short_tip = 3; asm_opt->max_short_ul_tip = 6; + asm_opt->max_contig_tip = 3; asm_opt->min_cnt = 2; asm_opt->mid_cnt = 5; asm_opt->purge_level_primary = 3; @@ -309,6 +329,11 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->self_scaf_gap_max = 3000000; asm_opt->sec_in = NULL; asm_opt->somatic_cov = -1; + + asm_opt->telo_motif = NULL; + asm_opt->telo_pen = 1; + asm_opt->telo_drop = 2000; + asm_opt->telo_mic_sc = 500; } void destory_enzyme(enzyme* f) @@ -638,6 +663,22 @@ int check_option(hifiasm_opt_t* asm_opt) return 0; } + if(asm_opt->telo_motif) { + uint64_t k, tlen = strlen((asm_opt->telo_motif)); char c; + if(tlen > 32) { + fprintf(stderr, "[ERROR] [--telo-m] must be no longer than 32\n"); + return 0; + } + for (k = 0; k < tlen; k++) { + c = asm_opt->telo_motif[k]; + if(c != 'A' && c != 'C' && c != 'G' && c != 'T' && + c != 'a' && c != 'c' && c != 'g' && c != 't') { + fprintf(stderr, "[ERROR] [--telo-m] must be A/C/G/T\n"); + return 0; + } + } + } + return 1; } @@ -859,6 +900,11 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) else if (c == 351) asm_opt->self_scaf_gap_max = atol(opt.arg); else if (c == 352) get_hic_enzymes(opt.arg, &(asm_opt->sec_in), 0); else if (c == 353) asm_opt->somatic_cov = atol(opt.arg); + else if (c == 354) asm_opt->telo_motif = opt.arg; + else if (c == 355) asm_opt->telo_pen = atol(opt.arg); + else if (c == 356) asm_opt->telo_drop = atol(opt.arg); + else if (c == 357) asm_opt->telo_mic_sc = atol(opt.arg); + else if (c == 358) asm_opt->max_contig_tip = atol(opt.arg); else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg); } diff --git a/CommandLines.h b/CommandLines.h index bf10db1..e8b9b9d 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.19.9-r606" +#define HA_VERSION "0.19.9-r616" #define VERBOSE 0 @@ -83,6 +83,7 @@ typedef struct { int min_overlap_coverage; int max_short_tip; int max_short_ul_tip; + int max_contig_tip; int min_cnt; int mid_cnt; int purge_level_primary; @@ -153,6 +154,11 @@ typedef struct { uint64_t self_scaf_reliable_min; int64_t self_scaf_gap_max; int64_t somatic_cov; + + char *telo_motif; + int64_t telo_pen; + int64_t telo_drop; + int64_t telo_mic_sc; } hifiasm_opt_t; extern hifiasm_opt_t asm_opt; diff --git a/Overlaps.cpp b/Overlaps.cpp index cecf95b..fb5a169 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -170,17 +170,12 @@ typedef struct { } kvect_sec_t; -typedef struct { - uint64_t n, mask; - uint8_t *hh; - uint64_t tlen, tm; -} telo_end_t; - typedef struct { All_reads *Rinf; UC_Read *aux; telo_end_t *u; khash_t(64) *h; + int64_t min_sc, penalty, max_drop; } telo_end_pip_t; ///this value has been updated at the first line of build_string_graph_without_clean @@ -193,11 +188,120 @@ kv_u_trans_t *get_utg_ovlp(ma_ug_t **ug, asg_t* read_g, ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp, kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t, uint8_t* r_het); void delete_useless_nodes(ma_ug_t **ug); -telo_end_t* gen_telo_end_t(All_reads *in, const char* motif, uint64_t motif_len, uint64_t n_thread) +static void mark_telo_ends(void *data, long i, int tid) // callback for kt_for() +{ + telo_end_pip_t *sl = (telo_end_pip_t *)data; + int64_t k, l, rlen, tlen, sc, max_sc, pen = sl->penalty, max_drop = sl->max_drop, min_sc = sl->min_sc, z; + uint64_t rid = i, c, x, mask = sl->u->mask, hit, xz; + UC_Read *rr = &(sl->aux[tid]); rlen = Get_READ_LENGTH((*(sl->Rinf)), rid); sl->u->hh[rid] = 0; + recover_UC_Read(rr, sl->Rinf, rid); assert((rr->length) == rlen); tlen = sl->u->tlen; + + for (k = l = 0, sc = max_sc = x = 0; k < rlen; k++) { // 5'-end, check CCCTAA + hit = 0; c = seq_nt6_table[(uint8_t)rr->seq[k]]; + if (c >= 0 && c <= 3) { // not N + x = (x<<2 | (c)) & mask; + if (((++l) >= tlen) && (kh_get(64, sl->h, x) != kh_end(sl->h))) {// x is at least 6bp long and is a telomere motif + hit = 1; + } + } else {l = 0, x = 0;} // N, ambiguous base + if(k >= tlen) sc += ((hit)?(1):(-pen)); + if (sc > max_sc) {max_sc = sc;} + else if ((max_sc - sc) > max_drop) {break;} + + if(max_sc >= min_sc) break; + } + + if(max_sc >= min_sc) { + for (k = l = 0, sc = max_sc = x = 0, xz = ((uint64_t)-1); k < rlen; k++) { // 5'-end, check CCCTAA + hit = 0; c = seq_nt6_table[(uint8_t)rr->seq[k]]; + if (c >= 0 && c <= 3) { // not N + x = (x<<2 | (c)) & mask; + if (((++l) >= tlen) && (kh_get(64, sl->h, x) != kh_end(sl->h))) {// x is at least 6bp long and is a telomere motif + if((xz != ((uint64_t)-1)) && (xz == x)) { + hit = 1; + } else { + for (z = 0, xz = sl->u->tm; (z < tlen) && (xz != x); z++) { + xz = (((xz>>((tlen-1)<<1))&(3ULL))|(xz<<2))&mask; + } + if((z < tlen) && (xz == x)) hit = 1; + } + } + } else {l = 0, x = 0;} // N, ambiguous base + + if(!hit) { + xz = ((uint64_t)-1); + } else { + xz = (((x>>((tlen-1)<<1))&(3ULL))|(x<<2))&mask; + } + if(k >= tlen) sc += ((hit)?(1):(-pen)); + if (sc > max_sc) {max_sc = sc;} + else if ((max_sc - sc) > max_drop) {break;} + + if(max_sc >= min_sc) break; + } + + if(max_sc >= min_sc) sl->u->hh[rid] |= 1; + } + + + + + + + for (k = rlen-1, l = 0, sc = max_sc = x = 0; k >= 0; --k) { // 3'-end + hit = 0; c = seq_nt6_table[(uint8_t)(RC_CHAR(rr->seq[k]))]; + if (c >= 0 && c <= 3) { // not N + x = (x<<2 | (c)) & mask; + if (((++l) >= tlen) && (kh_get(64, sl->h, x) != kh_end(sl->h))) {// x is at least 6bp long and is a telomere motif + hit = 1; + } + } else {l = 0, x = 0;} // N, ambiguous base + if((rlen - k) >= tlen) sc += ((hit)?(1):(-pen)); + if (sc > max_sc) {max_sc = sc;} + else if ((max_sc - sc) > max_drop) {break;} + + if(max_sc >= min_sc) break; + } + + if(max_sc >= min_sc) { + for (k = rlen-1, l = 0, sc = max_sc = x = 0, xz = ((uint64_t)-1); k >= 0 ; --k) { // 3'-end + hit = 0; c = seq_nt6_table[(uint8_t)(RC_CHAR(rr->seq[k]))]; + if (c >= 0 && c <= 3) { // not N + x = (x<<2 | (c)) & mask; + if (((++l) >= tlen) && (kh_get(64, sl->h, x) != kh_end(sl->h))) {// x is at least 6bp long and is a telomere motif + if((xz != ((uint64_t)-1)) && (xz == x)) { + hit = 1; + } else { + for (z = 0, xz = sl->u->tm; (z < tlen) && (xz != x); z++) { + xz = (((xz>>((tlen-1)<<1))&(3ULL))|(xz<<2))&mask; + } + if((z < tlen) && (xz == x)) hit = 1; + } + } + } else {l = 0, x = 0;} // N, ambiguous base + + if(!hit) { + xz = ((uint64_t)-1); + } else { + xz = (((x>>((tlen-1)<<1))&(3ULL))|(x<<2))&mask; + } + if((rlen - k) >= tlen) sc += ((hit)?(1):(-pen)); + if (sc > max_sc) {max_sc = sc;} + else if ((max_sc - sc) > max_drop) {break;} + + if(max_sc >= min_sc) break; + } + + if(max_sc >= min_sc) sl->u->hh[rid] |= 2; + } +} + + +telo_end_t* gen_telo_end_t(All_reads *in, const char* motif, int64_t min_sc, int64_t penalty, int64_t max_drop, uint64_t n_thread) { uint64_t j, k, c, x; int absent; telo_end_t* p = NULL; CALLOC(p, 1); - p->tlen = strlen(motif); p->mask = (1ULL<<(p->tlen<<1)) - 1; + p->tlen = strlen(motif); p->mask = ((1ULL)<<(p->tlen<<1))-1; for (j = 0, p->tm = 0; j < p->tlen; ++j) { c = seq_nt6_table[(uint8_t)motif[j]]; @@ -206,23 +310,47 @@ telo_end_t* gen_telo_end_t(All_reads *in, const char* motif, uint64_t motif_len, } p->n = in->total_reads; CALLOC(p->hh, p->n); - telo_end_pip_t *aux; CALLOC(aux, 1); + telo_end_pip_t *aux; CALLOC(aux, 1); + aux->min_sc = min_sc; aux->max_drop = max_drop; + aux->penalty = penalty; if(aux->penalty < 0) aux->penalty = (aux->penalty)*-1; aux->Rinf = in; aux->u = p; CALLOC(aux->aux, n_thread); for (k = 0; k < n_thread; k++) init_UC_Read(&(aux->aux[k])); - aux->h = kh_init(64); // hash table for all roations of the telomere motif - kh_resize(64, aux->h, (p->tlen*2)); + aux->h = kh_init(64); kh_resize(64, aux->h, (p->tlen*2)); // hash table for all roations of the telomere motif for (k = 0, x = p->tm; k < p->tlen; k++) { kh_put(64, aux->h, x, &absent); x = (((x>>((p->tlen-1)<<1))&(3ULL))|(x<<2))&p->mask; } assert(x == p->tm); + kt_for(n_thread, mark_telo_ends, aux, p->n); + + for (k = 0; k < n_thread; k++) destory_UC_Read(&(aux->aux[k])); + free(aux->aux); kh_destroy(64, aux->h); free(aux); + uint64_t t3, t5; + for (k = t3 = t5 = 0; k < p->n; k++) { + if(p->hh[k]&1) { + // fprintf(stderr, "%.*s(+)\n", (int)Get_NAME_LENGTH((*in), k), Get_NAME((*in), k)); + t5++; + } + if(p->hh[k]&2) { + // fprintf(stderr, "%.*s(-)\n", (int)Get_NAME_LENGTH((*in), k), Get_NAME((*in), k)); + t3++; + } + } + + fprintf(stderr, "[M::%s::] ==> # 5'-telomeres::%lu, # 3'-telomeres::%lu, # tot::%lu, motif::%s, motif_len::%lu\n", __func__, t5, t3, p->n, motif, p->tlen); + // exit(1); return p; } +void destory_telo_end_t(telo_end_t *p) +{ + free(p->hh); +} + void init_bub_label_t(bub_label_t* x, uint32_t n_thres, uint32_t n_reads) { uint32_t i; @@ -16498,6 +16626,45 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex) free(gfa_name); } +void print_simple_dbg_gfa(asg_t *g, const char* prt) +{ + char* gfa_name = (char*)malloc(strlen(prt)+50); + sprintf(gfa_name, "%s.dbg.noseq.gfa", prt); + FILE *fp = fopen(gfa_name, "w"); + uint32_t i, j; char name[32]; + for (i = 0; i < g->n_seq; ++i) { // the Segment lines in GFA + if(g->seq[i].del) continue; + sprintf(name, "utg%.6dl", i + 1); + fprintf(fp, "S\t%s\t*\tLN:i:%d\trd:i:%u\n", name, g->seq[i].len, 0); + } + + asg_arc_t* au = NULL; uint32_t nu, u, v; + for (i = 0; i < g->n_seq; ++i) { + if(g->seq[i].del) continue; + + u = i<<1; + au = asg_arc_a(g, u); nu = asg_arc_n(g, u); + for (j = 0; j < nu; j++) { + if(au[j].del) continue; + v = au[j].v; + fprintf(fp, "L\tutg%.6dl\t%c\tutg%.6dl\t%c\t%dM\tL1:i:%d\tL2:i:%u\n", + (u>>1)+1, "+-"[u&1], (v>>1)+1, "+-"[v&1], au[j].ol, asg_arc_len(au[j]), 0/**au[j].ou**/); + } + + + u = (i<<1) + 1; + au = asg_arc_a(g, u); nu = asg_arc_n(g, u); + for (j = 0; j < nu; j++) { + if(au[j].del) continue; + v = au[j].v; + fprintf(fp, "L\tutg%.6dl\t%c\tutg%.6dl\t%c\t%dM\tL1:i:%d\tL2:i:%u\n", + (u>>1)+1, "+-"[u&1], (v>>1)+1, "+-"[v&1], au[j].ol, asg_arc_len(au[j]), 0/**au[j].ou**/); + } + } + + fclose(fp); free(gfa_name); +} + ma_ug_t *get_poly_ug(asg_t *sg, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, int max_hang, int min_ovlp, R_to_U* ruIndex, bub_label_t* b_mask_t) { @@ -19602,6 +19769,65 @@ void destroy_rd_hamming_fly_simp_t(rd_hamming_fly_simp_t *p) free(p->srt); } +static void dbg_asys_gfa0(void *data, long i, int tid) // callback for kt_for() +{ + asg_t *g = (asg_t *)data; asg_arc_t *s = &(g->arc[i]), *ra; + if(s->del) return; + uint64_t rn, ri, v, w; + ra = asg_arc_a(g, (s->v^1)); rn = asg_arc_n(g, (s->v^1)); + for (ri = 0; ri < rn; ri++) { + if(ra[ri].del) continue; + if(ra[ri].v == ((s->ul>>32)^1)) break; + } + if(ri >= rn) { + v = s->ul>>32; w = s->v; + fprintf(stderr, "[M::%s] v::utg%.6lul(%c)\tw::utg%.6lul(%c)\n", __func__, (v>>1) + 1, "+-"[(v&1)], (w>>1) + 1, "+-"[(w&1)]); + exit(1); + } +} + +static void dbg_asys_gfa1(void *data, long i, int tid) // callback for kt_for() +{ + asg_t *g = (asg_t *)data; asg_arc_t *av, *ra; + uint64_t rn, ri, v, w, an, k; + v = i<<1; av = asg_arc_a(g, v); an = asg_arc_n(g, v); + for (k = 0; k < an; k++) { + if(av[k].del) continue; + ra = asg_arc_a(g, (av[k].v^1)); rn = asg_arc_n(g, (av[k].v^1)); + for (ri = 0; ri < rn; ri++) { + if(ra[ri].del) continue; + if(ra[ri].v == ((av[k].ul>>32)^1)) break; + } + if(ri >= rn) { + w = av[k].v; + fprintf(stderr, "[M::%s] v::utg%.6lul(%c)\tw::utg%.6lul(%c)\n", __func__, (v>>1) + 1, "+-"[(v&1)], (w>>1) + 1, "+-"[(w&1)]); + exit(1); + } + + } + + v = (i<<1) + 1; av = asg_arc_a(g, v); an = asg_arc_n(g, v); + for (k = 0; k < an; k++) { + if(av[k].del) continue; + ra = asg_arc_a(g, (av[k].v^1)); rn = asg_arc_n(g, (av[k].v^1)); + for (ri = 0; ri < rn; ri++) { + if(ra[ri].del) continue; + if(ra[ri].v == ((av[k].ul>>32)^1)) break; + } + if(ri >= rn) { + w = av[k].v; + fprintf(stderr, "[M::%s] v::utg%.6lul(%c)\tw::utg%.6lul(%c)\n", __func__, (v>>1) + 1, "+-"[(v&1)], (w>>1) + 1, "+-"[(w&1)]); + exit(1); + } + } +} + +void dbg_asys_gfa(asg_t *g) +{ + kt_for(asm_opt.thread_num, dbg_asys_gfa0, g, g->n_arc); + kt_for(asm_opt.thread_num, dbg_asys_gfa1, g, g->n_seq); +} + void clean_trio_untig_graph(ma_ug_t *ug, asg_t *read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, @@ -19621,16 +19847,16 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) // if(trio_flag == MOTHER) print_untig((ug), 10, "i-0:", 0); ///debug // if(!p) p = gen_rd_hamming_fly_simp_t(ug, read_g, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); - // fprintf(stderr, "[M::%s] 0\n", __func__); + // fprintf(stderr, "[M::%s] 0\n", __func__); dbg_asys_gfa(g); asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1, p); ///do not need to refine bubbles during the first round of cleaning if(!p) p = gen_rd_hamming_fly_simp_t(ug, read_g, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, ae); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-1:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-1:", 0); - // fprintf(stderr, "[M::%s] 1\n", __func__); + // fprintf(stderr, "[M::%s] 1\n", __func__); dbg_asys_gfa(g); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); - // fprintf(stderr, "[M::%s] 2\n", __func__); + // fprintf(stderr, "[M::%s] 2\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-2:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-2:", 0); @@ -19639,7 +19865,7 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) { cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); } - // fprintf(stderr, "[M::%s] 3\n", __func__); + // fprintf(stderr, "[M::%s] 3\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-3:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-3:", 0); /**********debug**********/ @@ -19647,16 +19873,16 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) long long cur_cons = 0; while(pre_cons != cur_cons) { - // fprintf(stderr, "[M::%s] 4\n", __func__); + // fprintf(stderr, "[M::%s] 4\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-4:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-4:", 0); pre_cons = get_graph_statistic(g); - // fprintf(stderr, "[M::%s] 5\n", __func__); + // fprintf(stderr, "[M::%s] 5\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-5:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-5:", 0); ///need consider tangles asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1, p); - // fprintf(stderr, "[M::%s] 6\n", __func__); + // fprintf(stderr, "[M::%s] 6\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-6:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-6:", 0); /**********debug**********/ @@ -19666,50 +19892,50 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) asg_arc_cut_trio_long_tip_primary(g, ug, read_g, reverse_sources, ruIndex, 2, tip_drop_ratio, trio_flag, cov, NULL); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-7:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-7:", 0); - // fprintf(stderr, "[M::%s] 7\n", __func__); + // fprintf(stderr, "[M::%s] 7\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); asg_arc_cut_trio_long_equal_tips_assembly(g, ug, read_g, reverse_sources, 2, ruIndex, trio_flag, cov, NULL); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-8:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-8:", 0); - // fprintf(stderr, "[M::%s] 8\n", __func__); + // fprintf(stderr, "[M::%s] 8\n", __func__); dbg_asys_gfa(g); asg_arc_cut_trio_long_tip_primary_complex(g, ug, read_g, reverse_sources, ruIndex, 2, tip_drop_ratio, stops_threshold, cov, NULL, trio_flag); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-9:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-9:", 0); - // fprintf(stderr, "[M::%s] 9\n", __func__); + // fprintf(stderr, "[M::%s] 9\n", __func__); dbg_asys_gfa(g); asg_arc_cut_trio_long_equal_tips_assembly_complex(g, ug, read_g, reverse_sources, 2, ruIndex, stops_threshold, cov, NULL, trio_flag); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-10:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-10:", 0); - // fprintf(stderr, "[M::%s] 10\n", __func__); + // fprintf(stderr, "[M::%s] 10\n", __func__); dbg_asys_gfa(g); detect_chimeric_by_topo(g, ug, read_g, reverse_sources, 2, stops_threshold, chimeric_rate, ruIndex, NULL, cov->is_r_het); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-11:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-11:", 0); - // fprintf(stderr, "[M::%s] 11\n", __func__); + // fprintf(stderr, "[M::%s] 11\n", __func__); dbg_asys_gfa(g); ///need consider tangles ///note we need both the read graph and the untig graph } /**********debug**********/ cur_cons = get_graph_statistic(g); - // fprintf(stderr, "[M::%s] 12\n", __func__); + // fprintf(stderr, "[M::%s] 12\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-12:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-12:", 0); } if(just_bubble_pop == 0) { - // fprintf(stderr, "[M::%s] 13\n", __func__); + // fprintf(stderr, "[M::%s] 13\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-13:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-13:", 0); cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-14:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-14:", 0); - // fprintf(stderr, "[M::%s] 14\n", __func__); + // fprintf(stderr, "[M::%s] 14\n", __func__); dbg_asys_gfa(g); } // print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); - // fprintf(stderr, "[M::%s] 15\n", __func__); + // fprintf(stderr, "[M::%s] 15\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-15:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-15:", 0); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); - // fprintf(stderr, "[M::%s] 16\n", __func__); + // fprintf(stderr, "[M::%s] 16\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-16:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-16:", 0); @@ -19719,15 +19945,15 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) resolve_tangles(ug, read_g, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, cov->is_r_het, trio_flag, drop_ratio); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-17:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-17:", 0); - // fprintf(stderr, "[M::%s] 17\n", __func__); + // fprintf(stderr, "[M::%s] 17\n", __func__); dbg_asys_gfa(g); drop_semi_circle(ug, g, read_g, reverse_sources, ruIndex, cov->is_r_het); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-18:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-18:", 0); - // fprintf(stderr, "[M::%s] 18\n", __func__); + // fprintf(stderr, "[M::%s] 18\n", __func__); dbg_asys_gfa(g); all_to_all_deduplicate(ug, read_g, coverage_cut, sources, trio_flag, trio_drop_rate, reverse_sources, ruIndex, cov->is_r_het, DOUBLE_CHECK_THRES, asm_opt.trio_flag_occ_thres); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-19:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-19:", 0); - // fprintf(stderr, "[M::%s] 19\n", __func__); + // fprintf(stderr, "[M::%s] 19\n", __func__); dbg_asys_gfa(g); // if(trio_flag == MOTHER) print_untig_by_read(ug, "m54329U_190827_173812/30214441/ccs", (uint32_t)-1, NULL, NULL, "bf-16"); if(is_first) { @@ -19735,7 +19961,7 @@ int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) unitig_arc_del_short_diploid_by_length(ug->g, drop_ratio); // if(trio_flag == MOTHER) print_untig((ug), 9, "i-20:", 0); // if(trio_flag == MOTHER) print_untig((ug), 10, "i-20:", 0); - // fprintf(stderr, "[M::%s] 20\n", __func__); + // fprintf(stderr, "[M::%s] 20\n", __func__); dbg_asys_gfa(g); goto redo; } if(p) { @@ -20734,6 +20960,23 @@ void purge_dump(ma_ug_t* ug) asg_cleanup(nsg); } +void discard_small_ctg(ma_ug_t **ug, asg_t* rg, kvec_asg_arc_t_warp* edge, int32_t max_rg_cut, uint32_t is_renew0, uint32_t is_renew1) +{ + if(is_renew0) renew_utg(ug, rg, edge); + uint32_t i, cnt = 0; ma_utg_t* u = NULL; + for (i = 0; i < (*ug)->u.n; i++) { + if(((int32_t)(*ug)->u.a[i].n) > max_rg_cut) continue; + u = &(((*ug))->u.a[i]); + asg_seq_del((*ug)->g, i); + if(u->m!=0) { + u->m = u->n = 0; free(u->a); u->a = NULL; u->len = u->circ = 0; u->start = u->end = 0; + } + cnt++; + } + + if(cnt && is_renew1) renew_utg(ug, rg, edge); +} + void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_rate, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, @@ -20764,14 +21007,16 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) fprintf(stderr, "[M::%s] primary contig coverage range: [%d, infinity]\n", __func__, asm_opt.recover_atg_cov_min); } - - // fprintf(stderr, "[M::%s] 0\n", __func__); + + // fprintf(stderr, "[M::%s] 0\n", __func__); dbg_asys_gfa((*ug)->g); adjust_utg_advance(read_g, (*ug), reverse_sources, ruIndex, b_mask_t, cov->is_r_het); - // fprintf(stderr, "[M::%s] 1\n", __func__); + // fprintf(stderr, "[M::%s] 1\n", __func__); dbg_asys_gfa((*ug)->g); + ///primary_flag = get_utg_attributes(*ug, read_g, coverage_cut, sources, ruIndex); update_unitig_graph((*ug), read_g, coverage_cut, sources, reverse_sources, ruIndex, cov->is_r_het, 0, DOUBLE_CHECK_THRES, flag, drop_rate); - // fprintf(stderr, "[M::%s] 2\n", __func__); + // fprintf(stderr, "[M::%s] 2\n", __func__); dbg_asys_gfa((*ug)->g); + nsg = (*ug)->g; n_vtx = nsg->n_seq; for (v = 0; v < n_vtx; ++v) @@ -20848,8 +21093,12 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) } // fprintf(stderr, "[M::%s] 18\n", __func__); - set_drop_trio_flag(*ug); + if(asm_opt.max_contig_tip > 0) { + discard_small_ctg(ug, read_g, new_rtg_edges, asm_opt.max_contig_tip, 1, 0); + } + // fprintf(stderr, "[M::%s] 19\n", __func__); + set_drop_trio_flag(*ug); destory_hap_cov_t(&cov); // fprintf(stderr, "[M::%s] 20\n", __func__); // purge_dump(*ug); @@ -25578,7 +25827,9 @@ hap_cov_t *cov, uint32_t is_update_chain, uint32_t keep_d, utg_trans_t *o) binfo_t *t = &b->a[w]; ///that means there is a circle, directly terminate the whole bubble poping ///if (w == v0) goto pop_reset; - if ((w>>1) == (v0>>1)) goto pop_reset; + if ((w>>1) == (v0>>1)) { + goto pop_reset; + } /****************************may have bugs********************************/ ///important when poping at long untig graph if(is_first && keep_d) l = 0; @@ -25695,7 +25946,9 @@ hap_cov_t *cov, uint32_t is_update_chain, uint32_t keep_d, utg_trans_t *o) else { ///at most one tip - if(n_tips != 0) goto pop_reset; + if(n_tips != 0) { + goto pop_reset; + } n_tips++; tip_end = w; } @@ -25720,7 +25973,9 @@ hap_cov_t *cov, uint32_t is_update_chain, uint32_t keep_d, utg_trans_t *o) } /****************************may have bugs for bubble********************************/ ///if i < nv, that means (d + l > max_dist) - if (i < nv || b->S.n == 0) goto pop_reset; + if (i < nv || b->S.n == 0) { + goto pop_reset; + } } while (b->S.n > 1 || n_pending); @@ -26208,6 +26463,7 @@ void append_node_arcs(asg_t *des, asg_t *src, uint8_t *s, uint8_t se, uint32_t v ///s[av[k].v]&se:: in the existing graph if(s[av[k].v]&se) { av[k].del = 0; n1++; + asg_arc_del(des, av[k].v^1, (av[k].ul>>32)^1, 0); } } @@ -26293,7 +26549,7 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, uint32_t is_upda ma_hit_t_alloc* src, ma_sub_t *sub, int32_t max_hang, int32_t min_ovlp, int32_t gap_fuzz, uint32_t *n_insert) { ///do not pop bubble within new_ug; - uint32_t is_pop = asg_bub_pop1_primary_trio(new_ug->g, new_ug, v0, max_dist, b, positive_flag, negative_flag, 0, NULL, NULL, cov, is_update_chain, 0, o); + uint32_t is_pop = asg_bub_pop1_primary_trio(new_ug->g, new_ug, v0, max_dist, b, positive_flag, negative_flag, 0, NULL, NULL, cov, is_update_chain, 1/**0**/, o); assert(is_pop); assert(b->S.a[0] == v1); ///b->S.a[0] is the sink of this bubble @@ -26348,10 +26604,16 @@ uint64_t renew_phase_bubble(rd_hamming_fly_simp_t *pf, uint64_t v0, buf_t *b, ma uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain) { uint64_t v, k, i, v1 = b->S.a[0], is_update, is_pop = 0; ma_ug_t *fg = pf->fg; - uint8_t *s = pf->vs; asg32_v *bc = pf->srt; uint8_t sn = 1, se = 2; + uint8_t *s = pf->vs; asg32_v *bc = pf->srt; uint8_t sn = 1, se = 2; uint64_t max_dist0 = max_dist; bc->n = 0; kv_resize(uint32_t, (*bc), b->b.n); assert((fg->u.a[v0>>1].len == ug->u.a[v0>>1].len) && (fg->u.a[v0>>1].n == ug->u.a[v0>>1].n)); assert((fg->u.a[v1>>1].len == ug->u.a[v1>>1].len) && (fg->u.a[v1>>1].n == ug->u.a[v1>>1].n)); + + + // if(((v0>>1) == 7536) && ((v1>>1) == 99223)) { + // print_simple_dbg_gfa(ug->g, "ug0"); + // } + ///b->S.a[0] is the sink of this bubble for (i = 0; i < b->b.n; i++) { v = b->b.a[i]; @@ -26365,32 +26627,40 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, v = v0; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 1; + for (k = 0; k < an; k++) { + av[k].del = 1; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 1); + } fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; v = v1^1; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 1; + for (k = 0; k < an; k++) { + av[k].del = 1; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 1); + } fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; for (i = 0; i < bc->n; i++) { v = bc->a[i]; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 1; + for (k = 0; k < an; k++) { + // av[k].del = 1; + av[k].del = 1; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 1); + } v ^= 1; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 1; + for (k = 0; k < an; k++) { + // av[k].del = 1; + av[k].del = 1; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 1); + } fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; } - - for (i = 0; i < bc->n; i++) { v = bc->a[i]; za = asg_arc_a(ug->g, v); zn = asg_arc_n(ug->g, v); @@ -26409,6 +26679,7 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, // if((av[k].v) == (v>>1)) continue;///looks like a bug if((av[k].v>>1) == (v>>1)) continue; av[k].del = 0; + ra = asg_arc_a(fg->g, (av[k].v^1)); rn = asg_arc_n(fg->g, (av[k].v^1)); for (ri = 0; ri < rn; ri++) { if(ra[ri].v == ((av[k].ul>>32)^1)) { @@ -26427,6 +26698,14 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, is_update = rd_hamming_symm_simple0(b, ug->g, fg->g, v0, v1^1, max_dist, &max_dist); + + // if(((v0>>1) == 7536) && ((v1>>1) == 99223)) { + // fprintf(stderr, "v0::utg%.6lul(%c)\tv1::utg%.6lul(%c)\n", (v0>>1) + 1, "+-"[(v0&1)], (v1>>1) + 1, "+-"[(v1&1)]); + // for (i = 0; i < bc->n; i++) fprintf(stderr, "w::utg%.6ul(%c)\n", (bc->a[i]>>1) + 1, "+-"[(bc->a[i]&1)]); + // print_simple_dbg_gfa(ug->g, "ug1"); + // print_simple_dbg_gfa(fg->g, "fg0"); + // } + // fprintf(stderr, "[M::%s] is_update::%lu\n", __func__, is_update); if(is_update) { for (i = 0; i < bc->n; i++) { @@ -26437,6 +26716,7 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, append_node_arcs(fg->g, ug->g, s, se, v1^1); is_pop = bub_pop_merge(ug, fg, v0, v1, max_dist, b, positive_flag, negative_flag, cov, is_update_chain, o, pf->ae, pf->src, pf->cov, pf->max_hang, pf->min_ovlp, pf->gap_fuzz, &(pf->n_insert)); } else { + max_dist = max_dist0; is_pop = asg_bub_pop1_primary_trio(ug->g, ug, v0, max_dist, b, positive_flag, negative_flag, 1, NULL, NULL, cov, is_update_chain, 0, o); } @@ -26447,14 +26727,18 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, v = v0; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 0; + for (k = 0; k < an; k++) { + av[k].del = 0; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 0); + } fg->g->seq[v>>1].c = PRIMARY_LABLE; v = v1^1; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 0; + for (k = 0; k < an; k++) { + av[k].del = 0; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 0); + } fg->g->seq[v>>1].c = PRIMARY_LABLE; @@ -26462,12 +26746,16 @@ uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, v = bc->a[i]; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 0; + for (k = 0; k < an; k++) { + av[k].del = 0; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 0); + } v ^= 1; av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); - for (k = 0; k < an; k++) av[k].del = 0; + for (k = 0; k < an; k++) { + av[k].del = 0; asg_arc_del(fg->g, av[k].v^1, (av[k].ul>>32)^1, 0); + } fg->g->seq[v>>1].c = PRIMARY_LABLE; } @@ -26478,6 +26766,10 @@ uint64_t refine_bubble_popping(ma_ug_t *ug, buf_t *b, uint32_t v0, uint64_t max_ { // fprintf(stderr, "[M::%s]\n", __func__); if(!asg_bub_pop1_primary_trio(ug->g, ug, v0, max_dist, b, positive_flag, negative_flag, 0, NULL, NULL, NULL, 0, 0, NULL)) return 0; + // if(((v0>>1) == 7536) && ((b->S.a[0]>>1) == 99223)) { + // fprintf(stderr, "[M::%s] uga_v::%u\n", __func__, v0); dbg_asys_gfa(ug->g); + // print_simple_dbg_gfa(ug->g, "uga"); + // } uint32_t non_positive_flag = (uint32_t)-1, v, u, k, rId, pn, npn; if(positive_flag == FATHER) non_positive_flag = MOTHER; if(positive_flag == MOTHER) non_positive_flag = FATHER; @@ -30858,6 +31150,7 @@ void merge_ug_nodes(ma_ug_t *ug, asg_t* read_g, kvec_t_u64_warp* array) ug->u.a[beg_uid>>1] = result; ug->g->seq[beg_uid>>1].del = 0; + ug->g->seq[beg_uid>>1].len = result.len; uint32_t oLen = 0; for (; i < array->a.n; i += 2) @@ -31120,7 +31413,35 @@ void unroll_simple_case_advance(ma_ug_t *ug, asg_t* read_g, ma_hit_t_alloc* reve kv_destroy(u_vecs.a); } +void dbg_spec_edge(asg_t *g, uint32_t s, uint32_t e) +{ + asg_arc_t *av; uint32_t an, k, v, w; + + if(s >= g->n_seq) return; + + v = s<<1; + av = asg_arc_a(g, v); an = asg_arc_n(g, v); + for (k = 0; k < an; k++) { + if(av[k].del) continue; + if((av[k].v>>1) == e) { + w = av[k].v; + fprintf(stderr, "[M::%s]\tL\tutg%.6dl\t%c\tutg%.6dl\t%c\t%dM\tL1:i:%d\tL2:i:%u\n", __func__, + (v>>1)+1, "+-"[v&1], (w>>1)+1, "+-"[w&1], av[k].ol, asg_arc_len(av[k]), 0/**au[j].ou**/); + } + } + v = (s<<1)+1; + av = asg_arc_a(g, v); an = asg_arc_n(g, v); + for (k = 0; k < an; k++) { + if(av[k].del) continue; + if((av[k].v>>1) == e) { + w = av[k].v; + fprintf(stderr, "[M::%s]\tL\tutg%.6dl\t%c\tutg%.6dl\t%c\t%dM\tL1:i:%d\tL2:i:%u\n", __func__, + (v>>1)+1, "+-"[v&1], (w>>1)+1, "+-"[w&1], av[k].ol, asg_arc_len(av[k]), 0/**au[j].ou**/); + } + } + +} void adjust_utg_advance(asg_t *sg, ma_ug_t *ug, ma_hit_t_alloc* reverse_sources, R_to_U* ruIndex, bub_label_t* b_mask_t, uint8_t* is_r_het) { @@ -31130,6 +31451,7 @@ void adjust_utg_advance(asg_t *sg, ma_ug_t *ug, ma_hit_t_alloc* reverse_sources, drop_semi_circle(ug, ug->g, sg, reverse_sources, ruIndex, is_r_het); asg_cleanup(nsg); asg_symm(nsg); + // fprintf(stderr, "[M::%s]-4-\n", __func__); dbg_spec_edge(ug->g, 7536, 28129); ///debug_utg_graph(ug, sg, 0, 0); if(VERBOSE >= 1) { @@ -32134,6 +32456,11 @@ uint32_t collect_p_trans, uint32_t collect_p_trans_f) } + if(asm_opt.max_contig_tip > 0) { + discard_small_ctg(ug, read_g, new_rtg_edges, asm_opt.max_contig_tip, 0, 1); + } + + n_vtx = read_g->n_seq; for (v = 0; v < n_vtx; v++) { @@ -38163,22 +38490,22 @@ char *get_outfile_name(char* output_file_name) void gen_ug_opt_t(ug_opt_t *opt, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, int64_t max_hang, int64_t min_ovlp, int64_t gap_fuzz, int64_t min_dp, uint64_t* readLen, ma_sub_t *coverage_cut, R_to_U* ruIndex, long long tipsLen, -float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, bub_label_t* b_mask_t) +float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, bub_label_t* b_mask_t, telo_end_t *te) { memset(opt, 0, sizeof((*opt))); opt->sources = sources; opt->reverse_sources = reverse_sources; opt->max_hang = max_hang; opt->min_ovlp = min_ovlp; opt->gap_fuzz = gap_fuzz; opt->min_dp = min_dp; opt->readLen = readLen; opt->coverage_cut = coverage_cut; opt->ruIndex = ruIndex; opt->tipsLen = tipsLen; opt->tip_drop_ratio = tip_drop_ratio; opt->stops_threshold = stops_threshold; - opt->chimeric_rate = chimeric_rate; opt->drop_ratio = drop_ratio; opt->b_mask_t = b_mask_t; + opt->chimeric_rate = chimeric_rate; opt->drop_ratio = drop_ratio; opt->b_mask_t = b_mask_t; opt->te = te; } void create_ul_info(ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, int64_t max_hang, int64_t min_ovlp, int64_t gap_fuzz, -int64_t min_dp, uint64_t* readLen, ma_sub_t *coverage_cut, R_to_U* ruIndex, long long tipsLen, float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, bub_label_t* b_mask_t) +int64_t min_dp, uint64_t* readLen, ma_sub_t *coverage_cut, R_to_U* ruIndex, long long tipsLen, float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, bub_label_t* b_mask_t, telo_end_t *te) { ug_opt_t opt; gen_ug_opt_t(&opt, sources, reverse_sources, max_hang, min_ovlp, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, - tipsLen, tip_drop_ratio, stops_threshold, chimeric_rate, drop_ratio, b_mask_t); + tipsLen, tip_drop_ratio, stops_threshold, chimeric_rate, drop_ratio, b_mask_t, te); ul_load(&opt); } @@ -38202,7 +38529,7 @@ void prt_dbg_gfa(asg_t *sg, const char *suffix, ma_sub_t *cov, ma_hit_t_alloc* s } asg_t *gen_init_sg(int32_t min_dp, uint64_t n_read, int64_t mini_overlap_length, int64_t max_hang_length, int64_t gap_fuzz, -ma_hit_t_alloc* src, uint64_t* readLen, R_to_U* ruIndex, bub_label_t *b_mask_t, ma_sub_t** cov, all_ul_t *ul) +ma_hit_t_alloc* src, uint64_t* readLen, R_to_U* ruIndex, bub_label_t *b_mask_t, ma_sub_t** cov, all_ul_t *ul, telo_end_t *te) { asg_t *sg = NULL; // prt_specific_overlap(src, 22233, 22235, "1"); @@ -38221,7 +38548,7 @@ ma_hit_t_alloc* src, uint64_t* readLen, R_to_U* ruIndex, bub_label_t *b_mask_t, ug_opt_t uopt; sg = ma_sg_gen_ul(src, n_read, *cov, ruIndex, max_hang_length, mini_overlap_length, UL_COV_THRES); if(asm_opt.prt_dbg_gfa) prt_dbg_gfa(sg, "raw", *cov, src, ruIndex, max_hang_length, mini_overlap_length); - gen_ug_opt_t(&uopt, src, NULL, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, *cov, ruIndex, -1, -1, -1, -1, -1, b_mask_t); + gen_ug_opt_t(&uopt, src, NULL, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, *cov, ruIndex, -1, -1, -1, -1, -1, b_mask_t, te); ///debug // asg_symm(sg); @@ -38268,14 +38595,14 @@ void gradually_renew_g(ma_hit_t_alloc **src, ma_hit_t_alloc **rev_src, long long uint64_t **readLen, ma_sub_t **cov, R_to_U *ruIndex, asg_t **sg, int64_t mini_overlap_length, int64_t max_hang_length, ug_opt_t *uopt, int64_t clean_round, double min_ovlp_drop_ratio, double max_ovlp_drop_ratio, int64_t max_tip, int64_t gap_fuzz, int64_t min_dp, -bub_label_t *b_mask_t, uint32_t is_trio, int32_t ul_aln_round, char *o_file, const char *bin_file) +bub_label_t *b_mask_t, uint32_t is_trio, int32_t ul_aln_round, char *o_file, const char *bin_file, telo_end_t *te) { int32_t k, strl = strlen(bin_file)+1, kt, cl, sl; char *id = NULL; renew_g(src, rev_src, n_read, readLen, cov, ruIndex, sg, mini_overlap_length, max_hang_length, uopt, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, asm_opt.max_short_tip, b_mask_t, is_trio, o_file, bin_file, (ul_aln_round<=1)?1:0, 1, /**((is_trio)?(0):(1))**/((asm_opt.polyploidy<=2)?1:0)); gen_ug_opt_t(uopt, *src, *rev_src, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, *readLen, - *cov, ruIndex, (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, b_mask_t); + *cov, ruIndex, (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, b_mask_t, te); ug_ext_gfa(uopt, *sg, ug_ext_len); /**if(!ha_opt_triobin(&asm_opt))**/ hic_clean_adv(*sg, uopt); @@ -38292,7 +38619,7 @@ bub_label_t *b_mask_t, uint32_t is_trio, int32_t ul_aln_round, char *o_file, con uopt, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, asm_opt.max_short_tip, b_mask_t, is_trio, o_file, id, ((k+1)==ul_aln_round)?1:0, 0, 0); gen_ug_opt_t(uopt, *src, *rev_src, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, *readLen, - *cov, ruIndex, (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, b_mask_t); + *cov, ruIndex, (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, b_mask_t, te); ug_ext_gfa(uopt, *sg, ug_ext_len); /**if(!ha_opt_triobin(&asm_opt))**/ hic_clean_adv(*sg, uopt); } @@ -38312,6 +38639,9 @@ ma_sub_t **coverage_cut_ptr, int debug_g) asg_t *sg = *sg_ptr; bub_label_t b_mask_t; ug_opt_t uopt; + telo_end_t *te = NULL; + + if(asm_opt.telo_motif) te = gen_telo_end_t(&R_INF, asm_opt.telo_motif, asm_opt.telo_mic_sc, asm_opt.telo_pen, asm_opt.telo_drop, asm_opt.thread_num); if(debug_g) { @@ -38347,7 +38677,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) // prt_specific_overlap(sources, 22235, 22233, "0-a"); if(asm_opt.ar) { create_ul_info(sources, reverse_sources, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, - (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); + (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t, te); } // prt_specific_overlap(sources, 22233, 22235, "0-b"); // prt_specific_overlap(sources, 22235, 22233, "0-b"); @@ -38355,7 +38685,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) // prt_specific_overlap(sources, 22233, 22235, "0-c"); // prt_specific_overlap(sources, 22235, 22233, "0-c"); sg = gen_init_sg(min_dp, n_read, mini_overlap_length, max_hang_length, gap_fuzz, sources, readLen, ruIndex, - &b_mask_t, &coverage_cut, asm_opt.ar?&UL_INF:NULL); + &b_mask_t, &coverage_cut, asm_opt.ar?&UL_INF:NULL, te); // if(asm_opt.ar) exit(1); /** ///print_binned_reads(sources, n_read, coverage_cut); @@ -38391,7 +38721,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) // } gen_ug_opt_t(&uopt, sources, reverse_sources, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, - (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); + (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t, te); ul_clean_gfa(&uopt, sg, sources, reverse_sources, ruIndex, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, 0.6, asm_opt.max_short_tip, gap_fuzz, &b_mask_t, !!asm_opt.ar, ha_opt_triobin(&asm_opt), UL_COV_THRES, o_file); ///@brief debug @@ -38399,7 +38729,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) write_debug_graph(sg, sources, coverage_cut, output_file_name, reverse_sources, ruIndex, &UL_INF); debug_gfa:; gen_ug_opt_t(&uopt, sources, reverse_sources, max_hang_length, mini_overlap_length, gap_fuzz, min_dp, readLen, coverage_cut, ruIndex, - (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t); + (asm_opt.max_short_tip*2), 0.15, 3, 0.05, 0.9, &b_mask_t, te); // set_hom_global_coverage(&asm_opt, sg, coverage_cut, sources, reverse_sources, ruIndex, max_hang_length, mini_overlap_length); } @@ -38407,7 +38737,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) gradually_renew_g(&sources, &reverse_sources, &n_read, &readLen, &coverage_cut, ruIndex, &sg, mini_overlap_length, max_hang_length, &uopt, clean_round, min_ovlp_drop_ratio, max_ovlp_drop_ratio, asm_opt.max_short_tip, gap_fuzz, min_dp, &b_mask_t, - ha_opt_triobin(&asm_opt), asm_opt.ul_clean_round, o_file, "re"); + ha_opt_triobin(&asm_opt), asm_opt.ul_clean_round, o_file, "re", te); } else { ug_ext_gfa(&uopt, sg, ug_ext_len); if(!ha_opt_triobin(&asm_opt)) { @@ -38630,7 +38960,10 @@ ma_sub_t **coverage_cut_ptr, int debug_g) *coverage_cut_ptr = coverage_cut; *sg_ptr = sg; - destory_bub_label_t(&b_mask_t); + destory_bub_label_t(&b_mask_t); + if(te) { + destory_telo_end_t(te); free(te); te = NULL; + } free(o_file); ///if(asm_opt.ar) destory_all_ul_t(&UL_INF); fprintf(stderr, "Inconsistency threshold for low-quality regions in BED files: %u%%\n", asm_opt.bed_inconsist_rate); } diff --git a/Overlaps.h b/Overlaps.h index 899bfe1..6a866d5 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -85,6 +85,12 @@ typedef struct { uint32_t n; } idx_emask_t; +typedef struct { + uint64_t n, mask; + uint8_t *hh; + uint64_t tlen, tm; +} telo_end_t; + typedef struct { ///off: start idx in mg128_t * a[]; ///cnt: how many eles in this chain @@ -1115,6 +1121,7 @@ typedef struct{ int64_t min_dp; bub_label_t* b_mask_t; uint64_t* readLen; + telo_end_t *te; }ug_opt_t; typedef struct{ @@ -1130,7 +1137,6 @@ typedef struct{ int64_t mini_ovlp; }ul_renew_t; - void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_rate, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, long long tipsLen, float tip_drop_ratio, long long stops_threshold, diff --git a/gfa_ut.cpp b/gfa_ut.cpp index dbc04a2..56aefae 100644 --- a/gfa_ut.cpp +++ b/gfa_ut.cpp @@ -62,7 +62,7 @@ typedef struct { uint32_t len; usg_arc_warp arc[2]; usg_arc_mm_warp arc_mm[2]; - uint8_t del; + uint8_t del, telo; } usg_seq_t; #define usg_arc_key(p) ((p).v) @@ -164,6 +164,7 @@ typedef struct { ul_bg_t bg; asg64_v *iug_tra; uinfo_srt_warp_t *iug_seq; uint64_t iug_cov_thre; + uint8_t *telo; } ul2ul_idx_t; #define ul2ul_srt_key(p) ((p).hid) @@ -550,32 +551,36 @@ static inline int asg_end(const asg_t *g, uint32_t v, uint64_t *lw, uint32_t *ou return ASG_ET_MERGEABLE; } -uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_ou, R_to_U *ru) +uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_ou, R_to_U *ru, telo_end_t *te) { asg64_v tx = {0,0,0}, *b = NULL; - uint32_t n_vtx = g->n_seq<<1, v, w, i, k, cnt = 0, nv, kv, pb, ou, mm_ou, rr, is_u; + uint32_t n_vtx = g->n_seq<<1, v, w, i, k, cnt = 0, nv, kv, pb, ou, mm_ou, rr, is_u, is_telo; asg_arc_t *av = NULL; uint64_t lw; if(in) b = in; else b = &tx; b->n = 0; for (v = 0; v < n_vtx; ++v) { if (g->seq[v>>1].del) continue; + if(te && te->hh[v>>1]) continue; av = asg_arc_a(g, v^1); nv = asg_arc_n(g, v^1); for (i = kv = 0; i < nv; i++) { if (av[i].del) continue; kv++; break; } - if(kv) continue; - kv = 1; mm_ou = (uint32_t)-1; ou = 0; + + kv = 1; mm_ou = (uint32_t)-1; ou = 0; is_telo = 0; + if(te && te->hh[v>>1]) is_telo = 1; for (i = 0, w = v; i < max_ext; i++) { if(asg_end(g, w^1, &lw, is_ou?&ou:NULL)!=0) break; w = (uint32_t)lw; kv++; mm_ou = MIN(mm_ou, ou); + if(te && te->hh[w>>1]) is_telo = 1; } + if(mm_ou == (uint32_t)-1) mm_ou = 0; kv += mm_ou; i += mm_ou; - if(i < max_ext/** + (!!is_ou)**/) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); + if((i < max_ext/** + (!!is_ou)**/) && (!is_telo)) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); } radix_sort_srt64(b->a, b->a + b->n); @@ -584,6 +589,7 @@ uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_o v = (uint32_t)(b->a[k]); if (g->seq[v>>1].del) continue; + if(te && te->hh[v>>1]) continue; av = asg_arc_a(g, v^1); nv = asg_arc_n(g, v^1); for (i = kv = 0; i < nv; i++) { @@ -592,15 +598,18 @@ uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_o } if(kv) continue; - pb = b->n; kv_push(uint64_t, *b, v); mm_ou = (uint32_t)-1; ou = 0; + + pb = b->n; kv_push(uint64_t, *b, v); mm_ou = (uint32_t)-1; ou = 0; is_telo = 0; + if(te && te->hh[v>>1]) is_telo = 1; for (i = 0, w = v; i < max_ext; i++) { if(asg_end(g, w^1, &lw, is_ou?&ou:NULL)!=0) break; w = (uint32_t)lw; kv_push(uint64_t, *b, lw); mm_ou = MIN(mm_ou, ou); + if(te && te->hh[w>>1]) is_telo = 1; } if(mm_ou == (uint32_t)-1) mm_ou = 0; i += mm_ou; - if(i < max_ext/** + (!!is_ou)**/) { + if((i < max_ext/** + (!!is_ou)**/) && (!is_telo)) { for (i = pb; i < b->n; i++) asg_seq_del(g, ((uint32_t)b->a[i])>>1); cnt++; } @@ -620,14 +629,17 @@ uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_o get_R_to_U(ru, v>>1, &rr, &is_u); if(rr == (uint32_t)-1 || is_u == 1) continue; + if(te && te->hh[v>>1]) continue; kv_push(uint64_t, *b, v); for (i = 0, w = v; i < max_ext; i++) { if(asg_end(g, w^1, &lw, NULL)!=0) break; w = (uint32_t)lw; - get_R_to_U(ru, lw>>1, &rr, &is_u); + get_R_to_U(ru, w>>1, &rr, &is_u); if(rr == (uint32_t)-1 || is_u == 1) break; + if(te && te->hh[w>>1]) break; + kv_push(uint64_t, *b, lw); } @@ -902,7 +914,7 @@ int32_t if_sup_chimeric(ma_hit_t_alloc* src, uint64_t rLen, asg64_v *b, int if_e } ///remove single node -void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t ou_thres) +void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t ou_thres, telo_end_t *te) { asg64_v tx = {0,0,0}, *b = NULL; uint32_t v, w, ei[2] = {0}, k, i, n_vtx = g->n_seq<<1; @@ -913,6 +925,7 @@ void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t o for (v = 0; v < n_vtx; ++v) { if (g->seq[v>>1].del) continue; + if (te && te->hh[v>>1]) continue; if(g->seq_vis[v] == 0) { if((get_arcs(g, v, &(ei[0]), 1)!=1) || (get_arcs(g, v^1, &(ei[1]), 1)!=1)) continue; assert((g->arc[ei[0]].ul>>32) == v && (g->arc[ei[1]].ul>>32) == (v^1)); @@ -940,6 +953,7 @@ void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t o } if(!el_n) continue; + if(te && te->hh[v>>1]) continue; asg_seq_del(g, v>>1); cnt++; } @@ -2293,7 +2307,7 @@ uint32_t asg_cut_semi_circ(asg_t *g, uint32_t lim_len, uint32_t is_clean) return cnt; } -uint32_t asg_cut_chimeric_bub(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t is_clean) +uint32_t asg_cut_chimeric_bub(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t is_clean, telo_end_t *te) { asg64_v tx = {0,0,0}, *b = NULL; uint32_t v, w, nw, k, n_vtx = g->n_seq<<1, ei[2] = {0}, e, ss, cnt = 0; @@ -2304,6 +2318,7 @@ uint32_t asg_cut_chimeric_bub(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32 // fprintf(stderr, "[M::%s]\n", __func__); for (v = 0; v < n_vtx; ++v) { if (g->seq[v>>1].del) continue; + if(te && te->hh[v>>1]) continue; ///note: ei[0] and ei[1] are the edge idx if((get_arcs(g, v, &(ei[0]), 1)!=1) || (get_arcs(g, v^1, &(ei[1]), 1)!=1)) continue; assert((g->arc[ei[0]].ul>>32) == v && (g->arc[ei[1]].ul>>32) == (v^1)); @@ -2337,12 +2352,12 @@ uint32_t asg_cut_chimeric_bub(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32 return cnt; } -void asg_iterative_semi_circ(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t pop_chimer) +void asg_iterative_semi_circ(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t pop_chimer, telo_end_t *te) { uint64_t occ = 0, s = 1; while (s) { s = asg_cut_semi_circ(g, LIM_LEN, 0); - if(pop_chimer) s = s + asg_cut_chimeric_bub(g, src, in, normal_len, 0); + if(pop_chimer) s = s + asg_cut_chimeric_bub(g, src, in, normal_len, 0, te); occ += s; } @@ -2771,7 +2786,7 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i // debug_info_of_specfic_node("m64011_190830_220126/47516220/ccs", sg, rI, "beg-0"); // debug_info_of_specfic_node("m64012_190920_173625/163644465/ccs", sg, rI, "beg-0"); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te);///p_telo // fprintf(stderr, "[M::%s] count_edges_v_w(sg, 49778, 49847)->%ld\n", __func__, count_edges_v_w(sg, 49778, 49847)); // if(is_ou) dedup_contain_g(uopt, sg); for (i = 0; i < clean_round; i++, drop += step) { @@ -2785,21 +2800,21 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i // print_vw_edge(sg, 34156, 34090, "0"); // stats_chimeric(sg, src, &bu); - if(!is_ou) asg_iterative_semi_circ(sg, src, &bu, max_tip, 1); + if(!is_ou) asg_iterative_semi_circ(sg, src, &bu, max_tip, 1, uopt->te);///p_telo asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 1); - asg_arc_cut_chimeric(sg, src, &bu, is_ou?ou_thres:(uint32_t)-1); + asg_arc_cut_chimeric(sg, src, &bu, is_ou?ou_thres:(uint32_t)-1, uopt->te);///p_telo - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); // prt_specfic_sge(sg, 10531, 10519, "--1--"); asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_arc_cut_inexact(sg, src, &bu, max_tip, is_ou, is_trio, min_diff, ou_drop_rate/**, NULL**//**&dbg**/); // debug_edges(&dbg, d, 2); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); // prt_specfic_sge(sg, 10531, 10519, "--2--"); asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 1); asg_arc_cut_length(sg, &bu, max_tip, drop, ou_drop_rate, is_ou, is_trio, 1, min_diff, 1, NULL, NULL, NULL); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); // prt_specfic_sge(sg, 10531, 10519, "--3--"); // if(is_ou) asg_arc_cut_contain(fg, &bu, &ba, rI, ((i+1)te); // prt_specfic_sge(sg, 10531, 10519, "--5--"); // if(i == 3) { @@ -2838,13 +2853,13 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i // dedup_contain_g(uopt, sg); if(clean_contain_g(uopt, sg, 1)) update_sg_uo(sg, src); } - if(!is_ou) asg_iterative_semi_circ(sg, src, &bu, max_tip, 1); + if(!is_ou) asg_iterative_semi_circ(sg, src, &bu, max_tip, 1, uopt->te); // print_debug_gfa(sg, NULL, uopt->coverage_cut, "UL.dirty5.debug", uopt->sources, uopt->ruIndex, uopt->max_hang, uopt->min_ovlp, 1, 0, 0); asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_cut_large_indel(sg, &bu, max_tip, HARD_OU_DROP, is_ou, min_diff);///shoule we ignore ou here? - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); // print_debug_gfa(sg, NULL, uopt->coverage_cut, "UL.dirty6.debug", uopt->sources, uopt->ruIndex, uopt->max_hang, uopt->min_ovlp, 1, 0, 0); @@ -2852,18 +2867,18 @@ double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, i ///asg_arc_del_triangular_directly might be unnecessary asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_arc_cut_length(sg, &bu, max_tip, HARD_ORTHOLOGY_DROP/**min_ovlp_drop_ratio**/, ou_drop_rate, is_ou, 0/**is_trio**/, is_ou?1:0, min_diff, 1, rev, rI, NULL); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); // print_debug_gfa(sg, NULL, uopt->coverage_cut, "UL.dirty7.debug", uopt->sources, uopt->ruIndex, uopt->max_hang, uopt->min_ovlp, 1, 0, 0); asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_arc_cut_length(sg, &bu, max_tip, min_ovlp_drop_ratio, ou_drop_rate, is_ou, 0/**is_trio**/, is_ou?1:0, min_diff, 1, rev, rI, &l_drop); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); } else { min_diff = step_diff; l_drop = 6000; asg_arc_identify_simple_bubbles_multi(sg, b_mask_t, 0); asg_arc_cut_length(sg, &bu, max_tip, 0.3, 0.9, is_ou, is_trio, 1, min_diff, 8, NULL, NULL, &l_drop); - asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL); + asg_arc_cut_tips(sg, max_tip, &bu, is_ou, is_ou?rI:NULL, uopt->te); } // print_debug_gfa(sg, NULL, uopt->coverage_cut, "UL.dirty8.debug", uopt->sources, uopt->ruIndex, uopt->max_hang, uopt->min_ovlp, 1, 0, 0); @@ -8370,7 +8385,7 @@ static inline void ulg_seq_del(ma_ug_t *ug, uint32_t s) uint32_t ulg_arc_cut_tips(ul_resolve_t *uidx, ma_ug_t *ug, uint32_t max_ext, uint32_t max_ext_hifi, uint32_t is_double_check, asg64_v *in, asg64_v *ib) { asg64_v tx = {0,0,0}, tb = {0,0,0}, *b = NULL, *ub = NULL; asg_t *g = ug->g; - uint32_t n_vtx = g->n_seq<<1, v, w, i, k, cnt = 0, nv, kv, pb, w_occ, a_occ, ul_occ, del_occ; + uint32_t n_vtx = g->n_seq<<1, v, w, i, k, cnt = 0, nv, kv, pb, w_occ, a_occ, ul_occ, del_occ, is_telo; asg_arc_t *av = NULL; uint64_t lw; b = (in?(in):(&tx)); ub = (ib?(ib):(&tb)); for (v = 0, b->n = 0; v < n_vtx; ++v) { @@ -8385,14 +8400,16 @@ uint32_t ulg_arc_cut_tips(ul_resolve_t *uidx, ma_ug_t *ug, uint32_t max_ext, uin if(kv) continue; // kv = ug->u.a[v>>1].n;/// get_ul_occ(uidx, v>>1); get_iug_u_raw_occ(uidx, v>>1, &ul_occ, NULL); kv = ul_occ; + is_telo = 0; if(uidx->uovl.telo && uidx->uovl.telo[v>>1]) is_telo = 1; for (i = 0, w = v; i < max_ext; i++) { if(asg_end(g, w^1, &lw, NULL)!=0) break; w = (uint32_t)lw; // kv += ug->u.a[w>>1].n; ///get_ul_occ(uidx, w>>1); get_iug_u_raw_occ(uidx, w>>1, &ul_occ, NULL); kv += ul_occ; + if(uidx->uovl.telo && uidx->uovl.telo[w>>1]) is_telo = 1; } - if(kv <= max_ext) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); + if((kv <= max_ext) && (!is_telo)) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); } radix_sort_srt64(b->a, b->a + b->n); @@ -8412,16 +8429,18 @@ uint32_t ulg_arc_cut_tips(ul_resolve_t *uidx, ma_ug_t *ug, uint32_t max_ext, uin pb = b->n; kv_push(uint64_t, *b, v); // kv = ug->u.a[v>>1].n;/// get_ul_occ(uidx, v>>1); get_iug_u_raw_occ(uidx, v>>1, &ul_occ, NULL); kv = ul_occ; + is_telo = 0; if(uidx->uovl.telo && uidx->uovl.telo[v>>1]) is_telo = 1; for (i = 0, w = v; i < max_ext; i++) { if(asg_end(g, w^1, &lw, NULL)!=0) break; w = (uint32_t)lw; kv_push(uint64_t, *b, w); // kv += ug->u.a[w>>1].n; ///get_ul_occ(uidx, w>>1); get_iug_u_raw_occ(uidx, w>>1, &ul_occ, NULL); kv += ul_occ; + if(uidx->uovl.telo && uidx->uovl.telo[w>>1]) is_telo = 1; } - if(kv <= max_ext && get_remove_hifi_occ(uidx, max_ext_hifi, b->a + pb, b->n - pb, ub, &del_occ)) { + if((!is_telo) && (kv <= max_ext) && (get_remove_hifi_occ(uidx, max_ext_hifi, b->a + pb, b->n - pb, ub, &del_occ))) { // fprintf(stderr, "*[M::%s::] k::%u, v>>1::%u, v&1::%u, kv::%u, max_ext::%u, max_ext_hifi::%u\n\n", // __func__, k, v>>1, v&1, kv, max_ext, max_ext_hifi); if(is_double_check) get_bridges(uidx, b->a + pb, b->n - pb, &w_occ, &a_occ); @@ -10410,7 +10429,7 @@ uint32_t usg_real_tip(usg_t *g, uint32_t v0, double rate) uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_v *in) { asg64_v tx = {0,0,0}, *b = NULL; - uint32_t n_vtx = g->n<<1, v, w, i, k, cnt = 0, nv, kv, pb, ff; + uint32_t n_vtx = g->n<<1, v, w, i, k, cnt = 0, nv, kv, pb, ff, is_telo; usg_arc_t *av = NULL, *p; uint64_t lw; if(in) b = in; else b = &tx; @@ -10425,11 +10444,14 @@ uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_ } if(kv) continue; + + is_telo = 0; if(g->a[v>>1].telo) is_telo = 1; for (i = 0, w = v, kv = g->a[v>>1].occ; i < max_ext; i++) { if(usg_end(g, w^1, &lw)!=0) break; w = (uint32_t)lw; kv += g->a[w>>1].occ; + if(g->a[w>>1].telo) is_telo = 1; } - if(kv <= max_ext) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); + if((kv <= max_ext) && (!is_telo)) kv_push(uint64_t, *b, (((uint64_t)kv)<<32)|v); } radix_sort_srt64(b->a, b->a + b->n); @@ -10445,9 +10467,11 @@ uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_ if(kv) continue; pb = b->n; kv_push(uint64_t, *b, v); + is_telo = 0; if(g->a[v>>1].telo) is_telo = 1; for (i = 0, w = v, kv = g->a[v>>1].occ; i < max_ext; i++) { if(usg_end(g, w^1, &lw)!=0) break; w = (uint32_t)lw; kv += g->a[w>>1].occ; kv_push(uint64_t, *b, lw); + if(g->a[w>>1].telo) is_telo = 1; } // if((v>>1) == 308) { // fprintf(stderr, "[M::%s::] v>>1::%u, v&1::%u, kv::%u\n", @@ -10455,7 +10479,7 @@ uint32_t usg_arc_cut_tips(usg_t *g, uint32_t max_ext, uint32_t ignore_ul, asg64_ // } - if(kv <= max_ext) { + if((kv <= max_ext) && (!is_telo)) { ff = 0; if(!ignore_ul) {///consider UL for (i = pb; i + 1 < b->n; i++) { @@ -11368,7 +11392,7 @@ void update_dual_junction(usg_t *g, uint32_t v, uint32_t v_id, uint32_t w, uint3 for (k = 0; k < buf->n; k++) { nid = buf->a[k]>>1; kv_push(uint32_t, g->mp.a[g->a[nid].mm], g->n); s = push_usg_t_node(g, g->n); - s->mm = g->a[nid].mm; s->occ = g->a[nid].occ; s->len = g->a[nid].len; s->del = 0; + s->mm = g->a[nid].mm; s->occ = g->a[nid].occ; s->len = g->a[nid].len; s->telo = g->a[nid].telo; s->del = 0; s->arc[0].n = s->arc[1].n = 0; s->arc_mm[0].n = s->arc_mm[1].n = 0; } @@ -11936,7 +11960,7 @@ uint32_t ava_pass_unique_bridge(uint64_t *idx, uint64_t *integer_seq, uint64_t s uint32_t ava_pass_unique_bridge_tips(usg_t *g, asg64_v *b64, uint64_t g_s, uint64_t g_e, uint64_t *integer_seq, uint8_t *f, uint64_t max_ext, double max_ext_rate, uint64_t ext_up) { - uint64_t i, k, z, s, e, v, nv, bn = b64->n, kv, n_ext = 0, nkeep = 0, ncut = 0; usg_arc_t *av; + uint64_t i, k, z, s, e, v, nv, bn = b64->n, kv, kv_t, n_ext = 0, nkeep = 0, ncut = 0, is_telo; usg_arc_t *av; for (i = g_s; i < g_e; i++) {///available intervals within the same cluster s = b64->a[((uint32_t)b64->a[i])]>>32; e = ((uint32_t)b64->a[((uint32_t)b64->a[i])]); assert(s < e); for (k = s + 1; k < e; k++) {///note: here is [s, e] @@ -11977,10 +12001,10 @@ uint32_t ava_pass_unique_bridge_tips(usg_t *g, asg64_v *b64, uint64_t g_s, uint6 } } - ncut = MAX(nkeep, max_ext); + ncut = MAX(nkeep, max_ext); is_telo = 0; if(ncut > ext_up) ncut = ext_up; if(ncut < max_ext) ncut = max_ext; - while (b64->n > bn && n_ext < ncut) { + while ((b64->n > bn) && (n_ext < ncut) && (!is_telo)) { v = b64->a[--b64->n]; if(f[v]) continue; av = usg_arc_a(g, v^1); nv = usg_arc_n(g, v^1); for (i = kv = 0; i < nv && kv < 1; i++) { @@ -11991,10 +12015,13 @@ uint32_t ava_pass_unique_bridge_tips(usg_t *g, asg64_v *b64, uint64_t g_s, uint6 n_ext += g->a[v>>1].occ; f[v] = f[v^1] = 1; av = usg_arc_a(g, v); nv = usg_arc_n(g, v); - for (i = 0; i < nv; i++) { - if (av[i].del || f[av[i].v^1]) continue; + for (i = kv_t = 0; i < nv; i++) { + if(av[i].del) continue; + kv_t++; + if (f[av[i].v^1]) continue; kv_push(uint64_t, *b64, av[i].v); - } + } + if((!kv_t) && (g->a[v>>1].telo)) is_telo = 1; } ///reset f[] to 0 @@ -12031,6 +12058,7 @@ uint32_t ava_pass_unique_bridge_tips(usg_t *g, asg64_v *b64, uint64_t g_s, uint6 } } if((n_ext < ncut) && (n_ext > max_ext - 1)) n_ext = max_ext - 1; + if(is_telo) n_ext = max_ext + 1; return n_ext; } @@ -12072,7 +12100,7 @@ void update_usg_t_threading_0(ul_resolve_t *uidx, usg_t *ng, uint64_t *a, uint64 nid = a[k]>>1; nnid = ng->n; kv_push(uint32_t, ng->mp.a[ng->a[nid].mm], nnid); s = push_usg_t_node(ng, nnid); - s->mm = ng->a[nid].mm; s->occ = ng->a[nid].occ; s->len = ng->a[nid].len; s->del = 0; + s->mm = ng->a[nid].mm; s->occ = ng->a[nid].occ; s->len = ng->a[nid].len; s->telo = ng->a[nid].telo; s->del = 0; s->arc[0].n = s->arc[1].n = 0; s->arc_mm[0].n = s->arc_mm[1].n = 0; nnid <<= 1; nnid |= a[k]&1; kv_pushp(uint64_t, *b, &p); (*p) = a[k]^1; (*p) <<= 32; (*p) |= nnid^1; @@ -14361,6 +14389,11 @@ uint64_t rid_n, uint64_t scaf_len, char *scaf_id, asg_t *ng, ug_opt_t *uopt, ma_ REALLOC(rdb->trio_flag, rdb->total_reads); REALLOC(rdb->name_index, rdb->total_reads+1);///total_reads+1 REALLOC((*cov), rdb->total_reads); + if(uopt->te) { + assert(uopt->te->n == rid_n0); + uopt->te->n = rdb->total_reads; + REALLOC(uopt->te->hh, rdb->total_reads); + } for (i = rid_n0; i < rdb->total_reads; i++) { // fprintf(stderr, "[M::%s] i::%lu\n", __func__, i); rdb->N_site[i] = NULL; @@ -14383,6 +14416,7 @@ uint64_t rid_n, uint64_t scaf_len, char *scaf_id, asg_t *ng, ug_opt_t *uopt, ma_ rdb->trio_flag[i] = rdb->trio_flag[rmap[i]]; (*cov)[i] = (*cov)[rmap[i]]; cname = Get_NAME_LENGTH((*rdb), (rmap[i])); + if(uopt->te) uopt->te->hh[i] = uopt->te->hh[rmap[i]]; } else { z = &(a[(i-ng->r_seq)]); assert(z->nid == i); cname = scaf_id_len; if(reset_scaf_node_uinfo_srt_t(z, uopt->min_ovlp, scaf_len, &nlen) == 2) { @@ -14392,6 +14426,7 @@ uint64_t rid_n, uint64_t scaf_len, char *scaf_id, asg_t *ng, ug_opt_t *uopt, ma_ rdb->read_size[i] = nlen; (*cov)[i].s = 0; (*cov)[i].e = nlen; ng->seq[i].len = nlen;///update length for the scaffold node + if(uopt->te) uopt->te->hh[i] = 0; } rdb->name_index[i] = tname; tname += cname; rdb->total_reads_bases += rdb->read_length[i]; @@ -15651,7 +15686,7 @@ void u2g_hybrid_clean(ul_resolve_t *uidx, ulg_opt_t *ulopt, usg_t *ng, asg64_v * uint8_t *bs, *f; CALLOC(bs, ng->n<<1); CALLOC(f, ng->n); // fprintf(stderr, "\n[M::%s::] Starting hybrid clean, mm_tip::%ld\n", __func__, mm_tip); // prt_usg_t(uidx, ng, "ng_h0"); - usg_arc_cut_tips(ng, mm_tip, 0, b); + usg_arc_cut_tips(ng, mm_tip, 0, b);///p_telo // prt_usg_t(uidx, ng, "ng_h1"); // char sb[1000]; @@ -15809,6 +15844,15 @@ ma_ug_t *gen_hybrid_ug(ul_resolve_t *uidx, usg_t *ng) return ug; } +uint32_t gen_utg_telo(ma_utg_t *u, telo_end_t *te) +{ + uint32_t i; + for (i = 0; i < u->n; ++i) { + if(te->hh[u->a[i]>>33]) return 1; + } + return 0; +} + void renew_ul2_utg(ul_resolve_t *uidx); void u2g_threading(ul_resolve_t *uidx, ulg_opt_t *ulopt, uint64_t cov_cutoff, uint64_t is_bridg, asg64_v *b, asg64_v *ub) @@ -15825,6 +15869,11 @@ void u2g_threading(ul_resolve_t *uidx, ulg_opt_t *ulopt, uint64_t cov_cutoff, ui s->mm = k; s->arc[0].n = s->arc[1].n = 0; s->occ = raw->u.a[k].n; s->arc_mm[0].n = s->arc_mm[1].n; s->del = raw->g->seq[k].del; s->len = raw->g->seq[k].len; + s->telo = 0; + if(uidx->uopt->te) { + s->telo = gen_utg_telo(&(raw->u.a[k]), uidx->uopt->te); + } + av = asg_arc_a(raw->g, (k<<1)); nv = asg_arc_n(raw->g, (k<<1)); sv = &(s->arc[0]); for (z = 0; z < nv; z++) { if(av[z].del) continue; @@ -15966,7 +16015,7 @@ void u2g_clean(ul_resolve_t *uidx, ulg_opt_t *ulopt, uint32_t keep_raw_utg, uint cnt = 0; asg_arc_identify_simple_bubbles_multi(iug->g, ulopt->b_mask_t, 0); cnt += ulg_arc_cut_supports(uidx, iug, mm_tip, ulopt->max_tip_hifi, drop, ulopt->is_trio, topo_level, 1, NULL, keep_raw_utg, &bu, &uu); - cnt += ulg_arc_cut_tips(uidx, iug, mm_tip, ulopt->max_tip_hifi, 1, &bu, &uu); + cnt += ulg_arc_cut_tips(uidx, iug, mm_tip, ulopt->max_tip_hifi, 1, &bu, &uu);///p_telo } cnt = 1; topo_level = 1; mm_tip = ulopt->max_tip; @@ -15991,7 +16040,7 @@ void u2g_clean(ul_resolve_t *uidx, ulg_opt_t *ulopt, uint32_t keep_raw_utg, uint ulg_pop_bubble(uidx, iug, NULL, ((int64_t)0x7fffffff), ulopt->max_tip_hifi, 1, &bu, &uu); - while(ulg_arc_cut_z(uidx, iug, ((int64_t)0x7fffffff), ulopt->max_tip_hifi, 1.5, 0.15, 100, 0.8, ulopt->is_trio, 1, NULL, &bu, &uu)); + while(ulg_arc_cut_z(uidx, iug, ((int64_t)0x7fffffff), ulopt->max_tip_hifi, 1.5, 0.15, 100, 0.8, ulopt->is_trio, 1, NULL, &bu, &uu));///non_p_telo // sprintf(sb, "ig_ss_%lu_i_%ld", ss, i); // output_integer_graph(uidx, iug, sb, 0); @@ -16220,17 +16269,21 @@ void gen_raw_ug_seq(ul_resolve_t *uidx, ul_str_t *str, ma_utg_t *u, ma_ug_t *raw } -void renew_u2g_cov(ul_resolve_t *uidx) +void renew_u2g_cov(ul_resolve_t *uidx, telo_end_t *te) { ul2ul_idx_t *idx = &(uidx->uovl); uinfo_srt_warp_t *x; ma_ug_t *i_ug = idx->i_ug, *raw = uidx->l1_ug; uint64_t k, z, iug_occ, m, l, *a, a_n; free(idx->cc.uc); free(idx->cc.hc); free(idx->cc.raw_uc); free(idx->cc.iug_idx); free(idx->cc.iug_b); free(idx->cc.iug_a); + if(te) { + free(idx->telo); idx->telo = NULL; + } memset(&(idx->cc), 0, sizeof(idx->cc)); MALLOC(idx->cc.uc, i_ug->u.n); ///number of ul (contained+non-contained) MALLOC(idx->cc.raw_uc, i_ug->u.n); ///number of ul (non-contained) MALLOC(idx->cc.hc, i_ug->u.n); ///number of HiFi reads + if(te) MALLOC(idx->telo, i_ug->u.n); ///is telo kt_for(uidx->str_b.n_thread, worker_renew_u2g_cov, uidx, i_ug->u.n); @@ -16242,6 +16295,14 @@ void renew_u2g_cov(ul_resolve_t *uidx) gen_raw_ug_seq(uidx, uidx->pstr.str.a, &(i_ug->u.a[k]), raw, &(idx->cc.iug_a[k]), k); iug_occ += idx->cc.iug_a[k].n; x = &(idx->cc.iug_a[k]); for (z = 0; z < x->n; z++) idx->cc.iug_idx[x->a[z].v>>1]++; + + if(te) { + idx->telo[k] = 0; + for (z = 0; z < x->n; z++) { + if(gen_utg_telo(&(raw->u.a[x->a[z].v>>1]), te)) break; + } + if(z < x->n) idx->telo[k] = 1; + } } MALLOC(idx->cc.iug_b, iug_occ); ///idx for integer sequences @@ -16470,7 +16531,7 @@ void renew_ul2_utg(ul_resolve_t *uidx) } renew_utg(&(z->i_ug), z->i_g, NULL); free(z->i_ug->g->seq_vis); CALLOC(z->i_ug->g->seq_vis, z->i_ug->g->n_seq*2); - renew_u2g_cov(uidx); + renew_u2g_cov(uidx, uidx->uopt->te); renew_u2g_bg(uidx); } @@ -16515,7 +16576,7 @@ ul2ul_idx_t *gen_ul2ul(ul_resolve_t *uidx, ug_opt_t *uopt, ulg_opt_t *ulopt, uin z->i_ug = ma_ug_gen(z->i_g); // CALLOC(z->i_ug->g->seq_vis, z->i_ug->g->n_seq*2); - renew_u2g_cov(uidx); + renew_u2g_cov(uidx, uidx->uopt->te); renew_u2g_bg(uidx); // output_integer_graph(uidx, z->i_ug, asm_opt.output_file_name); @@ -17188,6 +17249,7 @@ void destroy_ul_resolve_t(ul_resolve_t *uidx) free(uidx->pstr.idx.a); + free(uidx->uovl.telo); for (k = 0; k < uidx->uovl.n; k++) { free(uidx->uovl.a[k].a); } diff --git a/gfa_ut.h b/gfa_ut.h index 8a395df..2a18b9b 100644 --- a/gfa_ut.h +++ b/gfa_ut.h @@ -17,9 +17,9 @@ typedef struct { void ul_clean_gfa(ug_opt_t *uopt, asg_t *sg, ma_hit_t_alloc *src, ma_hit_t_alloc *rev, R_to_U* rI, int64_t clean_round, double min_ovlp_drop_ratio, double max_ovlp_drop_ratio, double ou_drop_rate, int64_t max_tip, int64_t gap_fuzz, bub_label_t *b_mask_t, int32_t is_ou, int32_t is_trio, uint32_t ou_thres, char *o_file); -uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_ou, R_to_U *ru); -void asg_iterative_semi_circ(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t pop_chimer, asg64_v *dbg); -void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t ou_thres); +uint32_t asg_arc_cut_tips(asg_t *g, uint32_t max_ext, asg64_v *in, uint32_t is_ou, R_to_U *ru, telo_end_t *te); +void asg_iterative_semi_circ(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t normal_len, uint32_t pop_chimer, asg64_v *dbg, telo_end_t *te); +void asg_arc_cut_chimeric(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, uint32_t ou_thres, telo_end_t *te); void asg_arc_cut_inexact(asg_t *g, ma_hit_t_alloc* src, asg64_v *in, int32_t max_ext, uint32_t is_ou, uint32_t is_trio, uint32_t min_diff, float ou_rat/**, asg64_v *dbg**/); void asg_arc_cut_length(asg_t *g, asg64_v *in, int32_t max_ext, float len_rat, float ou_rat, uint32_t is_ou, uint32_t is_trio, uint32_t is_topo, uint32_t min_diff, uint32_t min_ou, ma_hit_t_alloc *rev, R_to_U* rI, uint32_t *max_drop_len);