Skip to content

Commit

Permalink
Merge pull request #540 from chhylp123/hifiasm_dev_debug
Browse files Browse the repository at this point in the history
fix bug for gen_contain_consensus_chain
  • Loading branch information
chhylp123 authored Oct 22, 2023
2 parents 40e2a48 + 4ca8951 commit 7f6d36b
Show file tree
Hide file tree
Showing 12 changed files with 281 additions and 54 deletions.
106 changes: 87 additions & 19 deletions Assembly.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -649,13 +649,6 @@ static void worker_ovec_cal0(void *data, long i, int tid)
{
ha_ovec_buf_t *b = ((ha_ovec_buf_t**)data)[tid];
int fully_cov, abnormal;
// if(i != 33) return;
// fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i);
// if (memcmp("m64012_190920_173625/88015004/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) {
// fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i);
// } else {
// return;
// }

ha_get_candidates_interface(b->ab, i, &b->self_read, &b->olist, &b->olist_hp, &b->clist,
0.02, asm_opt.max_n_chain, 1, NULL/**&(b->k_flag)**/, &b->r_buf, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->tmp_region), NULL, &(b->sp));
Expand All @@ -664,17 +657,24 @@ static void worker_ovec_cal0(void *data, long i, int tid)
clear_Round2_alignment(&b->round2);

correct_overlap(&b->olist, &R_INF, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon,
&b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 0, &fully_cov, &abnormal);
&b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 0/**1***/, &fully_cov, &abnormal);

b->num_read_base += b->self_read.length;
b->num_correct_base += b->correct.corrected_base;
b->num_recorrect_base += b->round2.dumy.corrected_base;



R_INF.paf[i].is_fully_corrected = 0;
R_INF.paf[i].is_abnormal = abnormal;
// R_INF.paf[i].is_fully_corrected = 0;
// R_INF.paf[i].is_abnormal = abnormal;
R_INF.trio_flag[i] = AMBIGU;

// R_INF.paf[i].is_fully_corrected = 0;
// if (fully_cov) {
// if (get_cigar_errors(&b->cigar1) == 0 && get_cigar_errors(&b->round2.cigar) == 0)
// R_INF.paf[i].is_fully_corrected = 1;
// }
// R_INF.paf[i].is_abnormal = abnormal;
// R_INF.trio_flag[i] = AMBIGU;

push_final_overlaps(&(R_INF.paf[i]), R_INF.reverse_paf, &b->olist, 1);
push_final_overlaps(&(R_INF.reverse_paf[i]), R_INF.reverse_paf, &b->olist, 2);
Expand Down Expand Up @@ -920,7 +920,7 @@ void rescue_hp_reads(ha_ovec_buf_t **b)
int hom_cov, het_cov;
ha_flt_tab_hp = ha_idx_hp = NULL;
if (!(asm_opt.flag & HA_F_NO_KMER_FLT)) {
ha_flt_tab_hp = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 1);
ha_flt_tab_hp = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 1, 0);
}
ha_idx_hp = ha_pt_gen(&asm_opt, ha_flt_tab, 1, 1, &R_INF, &hom_cov, &het_cov);

Expand Down Expand Up @@ -1038,7 +1038,7 @@ void ha_overlap_and_correct(int round)
///debug_print_pob_regions();
}

void ha_overlap_cal(int round)
void ha_overlap_cal(int round, int read_from_store)
{
int i, hom_cov, het_cov;
ha_ovec_buf_t **b;
Expand All @@ -1049,9 +1049,9 @@ void ha_overlap_cal(int round)
for (i = 0; i < asm_opt.thread_num; ++i)
b[i] = ha_ovec_init(0, (round == asm_opt.number_of_round - 1),0);
if(ha_idx) hom_cov = asm_opt.hom_cov;
if(ha_idx == NULL) ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, round == 0? 0 : 1, 0, &R_INF, &hom_cov, &het_cov); // build the index
if(ha_idx == NULL) ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, ((round == 0)&&(read_from_store == 0))?0:1, 0, &R_INF, &hom_cov, &het_cov); // build the index
///debug_adapter(&asm_opt, &R_INF);
if (round == 0 && ha_flt_tab == 0) // then asm_opt.hom_cov hasn't been updated
if (/**round == 0 &&**/ ha_flt_tab == 0) // then asm_opt.hom_cov hasn't been updated
ha_opt_update_cov(&asm_opt, hom_cov);
het_cnt = NULL;
if(round == asm_opt.number_of_round-1 && asm_opt.is_dbg_het_cnt) CALLOC(het_cnt, R_INF.total_reads);
Expand All @@ -1075,6 +1075,8 @@ void ha_overlap_cal(int round)
ha_ovec_destroy(b[i]);
}
free(b);
asm_opt.hom_cov = hom_cov;
asm_opt.het_cov = het_cov;
}


Expand Down Expand Up @@ -1787,7 +1789,7 @@ void hap_recalculate_peaks(char* output_file_name)
int hom_cov, het_cov;
// construct hash table for high occurrence k-mers
if (!(asm_opt.flag & HA_F_NO_KMER_FLT)) {
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0);
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0, 0);
ha_opt_update_cov(&asm_opt, hom_cov);
}
free(R_INF.read_length);
Expand Down Expand Up @@ -1898,13 +1900,13 @@ int ha_assemble_ovec(void)
// construct hash table for high occurrence k-mers
if (!(asm_opt.flag & HA_F_NO_KMER_FLT) && ha_flt_tab == NULL)
{
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0);
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0, 0);
ha_opt_update_cov(&asm_opt, hom_cov);
}
// error correction
assert(asm_opt.number_of_round > 0);
ha_opt_reset_to_round(&asm_opt, r); // this update asm_opt.roundID and a few other fields
ha_overlap_cal(r);
ha_overlap_cal(r, 0);
fprintf(stderr, "[M::%s] size of buffer: %.3fGB\n", __func__, asm_opt.mem_buf / 1073741824.0);


Expand Down Expand Up @@ -1944,7 +1946,7 @@ int ha_assemble(void)
// construct hash table for high occurrence k-mers
if (!(asm_opt.flag & HA_F_NO_KMER_FLT) && ha_flt_tab == NULL)
{
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0);
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0, 0);
ha_opt_update_cov(&asm_opt, hom_cov);
}
// error correction
Expand Down Expand Up @@ -1978,3 +1980,69 @@ int ha_assemble(void)
destory_All_reads(&R_INF);
return 0;
}


int ha_assemble_pair(void)
{
extern void ha_extract_print_list(const All_reads *rs, int n_rounds, const char *o);
int r = 0, hom_cov = -1, ovlp_loaded = 0; memset((&R_INF), 0, sizeof(R_INF));

if (asm_opt.load_index_from_disk && load_all_data_from_disk(&R_INF.paf, &R_INF.reverse_paf, asm_opt.output_file_name)) {
ovlp_loaded = 1;
fprintf(stderr, "[M::%s::%.3f*%.2f] ==> loaded corrected reads and overlaps from disk\n", __func__, yak_realtime(), yak_cpu_usage());
if (asm_opt.extract_list) {
ha_extract_print_list(&R_INF, asm_opt.extract_iter, asm_opt.extract_list);
exit(0);
}
if (asm_opt.flag & HA_F_WRITE_EC) Output_corrected_reads();
if (asm_opt.flag & HA_F_WRITE_PAF) Output_PAF();
if (asm_opt.het_cov == -1024) hap_recalculate_peaks(asm_opt.output_file_name), ovlp_loaded = 2;
}


if (!ovlp_loaded) {

if(!append_All_reads(&R_INF, asm_opt.output_file_name, 0)) {
fprintf(stderr, "[M::%s::] Cannot load %s.0\n", __func__, asm_opt.output_file_name);
exit(1);
}

if(!append_All_reads(&R_INF, asm_opt.output_file_name, 1)) {
fprintf(stderr, "[M::%s::] Cannot load %s.1\n", __func__, asm_opt.output_file_name);
exit(1);
}
// Output_corrected_reads(); exit(0);

ha_flt_tab = ha_idx = NULL; r = asm_opt.number_of_round - 1;
if((asm_opt.flag & HA_F_VERBOSE_GFA)) load_pt_index(&ha_flt_tab, &ha_idx, &R_INF, &asm_opt, asm_opt.output_file_name), load_ct_index(&ha_ct_table, asm_opt.output_file_name);

// construct hash table for high occurrence k-mers
if (!(asm_opt.flag & HA_F_NO_KMER_FLT) && ha_flt_tab == NULL) {
ha_flt_tab = ha_ft_gen(&asm_opt, &R_INF, &hom_cov, 0, 1);
ha_opt_update_cov(&asm_opt, hom_cov);
}
// error correction
assert(asm_opt.number_of_round > 0);
ha_opt_reset_to_round(&asm_opt, r); // this update asm_opt.roundID and a few other fields
ha_overlap_cal(r, 1);
fprintf(stderr, "[M::%s] size of buffer: %.3fGB\n", __func__, asm_opt.mem_buf / 1073741824.0);
fprintf(stderr, "[M::%s::%.3f*%.2f@%.3fGB] ==> found overlaps for the final round\n", __func__, yak_realtime(),
yak_cpu_usage(), yak_peakrss_in_gb());


if (asm_opt.flag & HA_F_WRITE_EC) Output_corrected_reads();
ha_print_ovlp_stat(R_INF.paf, R_INF.reverse_paf, R_INF.total_reads);
ha_ft_destroy(ha_flt_tab);
if (asm_opt.flag & HA_F_WRITE_PAF) Output_PAF();
ha_triobin(&asm_opt);

}
if(ovlp_loaded == 2) ovlp_loaded = 0;
ha_opt_update_cov_min(&asm_opt, asm_opt.hom_cov, MIN_N_CHAIN);

build_string_graph_without_clean(asm_opt.min_overlap_coverage, R_INF.paf, R_INF.reverse_paf,
R_INF.total_reads, R_INF.read_length, asm_opt.min_overlap_Len, asm_opt.max_hang_Len, asm_opt.clean_round,
asm_opt.gap_fuzz, asm_opt.min_drop_rate, asm_opt.max_drop_rate, asm_opt.output_file_name, asm_opt.large_pop_bubble_size, 0, !ovlp_loaded);
destory_All_reads(&R_INF);
return 0;
}
1 change: 1 addition & 0 deletions Assembly.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ typedef struct {
} ha_ovec_buf_t;

int ha_assemble(void);
int ha_assemble_pair(void);
void ug_idx_build(ma_ug_t *ug, int hap_n);
ha_ovec_buf_t *ha_ovec_init(int is_final, int save_ov, int is_ug);
ha_ovec_buf_t *ha_ovec_buf_init(void *km, int is_final, int save_ov, int is_ug);
Expand Down
3 changes: 3 additions & 0 deletions CommandLines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ static ko_longopt_t long_options[] = {
{ "ul-cut", ko_required_argument, 349},
{ "dual-scaf", ko_no_argument, 350},
{ "scaf-gap", ko_required_argument, 351},
{ "sec-in", ko_required_argument, 352},
// { "path-round", ko_required_argument, 348},
{ 0, 0, 0 }
};
Expand Down Expand Up @@ -305,6 +306,7 @@ void init_opt(hifiasm_opt_t* asm_opt)
asm_opt->self_scaf_min = 250000;
asm_opt->self_scaf_reliable_min = 5000000;
asm_opt->self_scaf_gap_max = 3000000;
asm_opt->sec_in = NULL;
}

void destory_enzyme(enzyme* f)
Expand Down Expand Up @@ -853,6 +855,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
else if (c == 349) asm_opt->ul_min_base = atol(opt.arg);
else if (c == 350) asm_opt->self_scaf = 1;
else if (c == 351) asm_opt->self_scaf_gap_max = atol(opt.arg);
else if (c == 352) get_hic_enzymes(opt.arg, &(asm_opt->sec_in), 0);
else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap
asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg);
}
Expand Down
3 changes: 2 additions & 1 deletion CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.19.7-r598"
#define HA_VERSION "0.19.8-r602"

#define VERBOSE 0

Expand Down Expand Up @@ -45,6 +45,7 @@ typedef struct {
enzyme *hic_reads[2];
enzyme *hic_enzymes;
enzyme *ar;
enzyme *sec_in;
int extract_iter;
int thread_num;
int k_mer_length;
Expand Down
48 changes: 23 additions & 25 deletions Overlaps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,8 @@ void init_ma_hit_t_alloc(ma_hit_t_alloc* x)
x->size = 0;
x->buffer = NULL;
x->length = 0;
x->is_fully_corrected = 0;
x->is_abnormal = 0;
}

void clear_ma_hit_t_alloc(ma_hit_t_alloc* x)
Expand Down Expand Up @@ -10743,6 +10745,7 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, int print_seq, const char* prefix, FIL
uint32_t x = p->a[j]>>33;
if(x<RNF->total_reads)
{

fprintf(fp, "A\t%s\t%d\t%c\t%.*s\t%d\t%d\tid:i:%d\tHG:A:%c\n", name, l, "+-"[p->a[j]>>32&1],
(int)Get_NAME_LENGTH((*RNF), x), Get_NAME((*RNF), x),
coverage_cut?coverage_cut[x].s:0, coverage_cut?coverage_cut[x].e:(int)Get_READ_LENGTH((*RNF), x), x,
Expand Down Expand Up @@ -10817,13 +10820,33 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, const char* prefix, FILE *fp)
ma_ug_print2(ug, &R_INF, read_g, coverage_cut, sources, ruIndex, 1, prefix, fp);
}

void prt_scaf_stats(sec_t *scp, ma_ug_t *ctg, const char* prefix, uint64_t id)
{
uint64_t k, tl0, tl1; ma_utg_t *z = NULL; char name[32];
for (k = tl0 = tl1 = 0; k < scp->n; k++) {
z = &(ctg->u.a[((uint32_t)scp->a[k])>>1]); tl0 += z->len; tl1 += (scp->a[k]>>32);
}
sprintf(name, "%s%.6lu%c", prefix, id + 1, "lc"[scp->is_c]);

fprintf(stderr, "[M::%s] [M::%s] tot::%lu\tel::%lu\tnl::%lu\tis_c::%lu\t#::%lu\t%s\n", __func__, name, tl0+tl1, tl0, tl1, scp->is_c, (uint64_t)scp->n, ((scp->n>1)?"scf":"ctg"));
for (k = 0; k < scp->n; k++) {
z = &(ctg->u.a[((uint32_t)scp->a[k])>>1]);
fprintf(stderr, "%s%.6u%c(%c)\tlen::%u\tNs::%lu\n",
prefix, (((uint32_t)scp->a[k])>>1)+1, "lc"[z->circ], "+-"[(((uint32_t)scp->a[k])&1)], z->len, (scp->a[k]>>32));
}

}

void ma_scg_print(const kvect_sec_t *sc, All_reads *RNF, asg_t* read_g, const ma_sub_t *coverage_cut, ma_hit_t_alloc* sources, R_to_U* ruIndex, int print_seq, const char* prefix, FILE *fp)
{
uint8_t* primary_flag = read_g?(uint8_t*)calloc(read_g->n_seq, sizeof(uint8_t)):NULL;
uint64_t i, j, l, pc = read_g && coverage_cut && sources && ruIndex?1:0, tl, m, x; sec_t *scp;
char name[32]; uint64_t Rb, Cb, tRb, tCb, Cov; ma_utg_t *z = NULL; uint8_t c;
for (i = 0; i < sc->n; ++i) { // the Segment lines in GFA
scp = &(sc->a[i]);
///debug
prt_scaf_stats(scp, sc->ctg, prefix, i);

for (j = tl = tRb = tCb = 0; j < scp->n; j++) {
z = &(sc->ctg->u.a[((uint32_t)scp->a[j])>>1]); tl += z->len; tl += (scp->a[j]>>32);
if(pc) {
Expand Down Expand Up @@ -22760,31 +22783,6 @@ void output_read_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name
}


void read_ma(ma_hit_t* x, FILE* fp)
{
int f_flag;
f_flag = fread(&(x->qns), sizeof(x->qns), 1, fp);
f_flag += fread(&(x->qe), sizeof(x->qe), 1, fp);
f_flag += fread(&(x->tn), sizeof(x->tn), 1, fp);
f_flag += fread(&(x->ts), sizeof(x->ts), 1, fp);
f_flag += fread(&(x->te), sizeof(x->te), 1, fp);
f_flag += fread(&(x->el), sizeof(x->el), 1, fp);
f_flag += fread(&(x->no_l_indel), sizeof(x->no_l_indel), 1, fp);

uint32_t t;
f_flag += fread(&(t), sizeof(t), 1, fp);
x->ml = t;

f_flag += fread(&(t), sizeof(t), 1, fp);
x->rev = t;

f_flag += fread(&(t), sizeof(t), 1, fp);
x->bl = t;

f_flag += fread(&(t), sizeof(t), 1, fp);
x->del = t;
}

int load_ma_hit_ts(ma_hit_t_alloc** x, char* read_file_name)
{
fprintf(stderr, "Loading ma_hit_ts from disk... \n");
Expand Down
4 changes: 2 additions & 2 deletions Overlaps.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ void add_ma_hit_t_alloc(ma_hit_t_alloc* x, ma_hit_t* element);
void ma_hit_sort_tn(ma_hit_t *a, long long n);
void ma_hit_sort_qns(ma_hit_t *a, long long n);

int load_all_data_from_disk(ma_hit_t_alloc **sources, ma_hit_t_alloc **reverse_sources,
char* output_file_name);
int load_all_data_from_disk(ma_hit_t_alloc **sources, ma_hit_t_alloc **reverse_sources, char* output_file_name);


typedef struct {
uint32_t s:31, del:1, e;
Expand Down
Loading

0 comments on commit 7f6d36b

Please sign in to comment.