Skip to content

Commit

Permalink
expose purge_dups
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Apr 18, 2020
1 parent 85c57fd commit 7f6725e
Showing 6 changed files with 116 additions and 35 deletions.
40 changes: 37 additions & 3 deletions CommandLines.cpp
Original file line number Diff line number Diff line change
@@ -15,7 +15,6 @@ static ko_longopt_t long_options[] = {
{ "write-paf", ko_no_argument, 302 },
{ "write-ec", ko_no_argument, 303 },
{ "skip-triobin", ko_no_argument, 304 },
{ "purge-trio", ko_no_argument, 305 },
{ 0, 0, 0 }
};

@@ -58,6 +57,15 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " -c INT lower bound of the binned k-mer's frequency [%d]\n", asm_opt->min_cnt);
fprintf(stderr, " -d INT upper bound of the binned k-mer's frequency [%d]\n", asm_opt->mid_cnt);

fprintf(stderr, " Purge-dups:\n");
fprintf(stderr, " -l INT level of purge-dup. In default, [%d] for non-trio; [%d] for trio (see hifiasm.1 for details)\n",
asm_opt->purge_level_primary, asm_opt->purge_level_trio);
fprintf(stderr, " -s FLOAT similarity threshold for duplicate haplotigs [%g]\n",
asm_opt->purge_simi_rate);
fprintf(stderr, " -O INT min number of overlapped reads for duplicate haplotigs [%d]\n",
asm_opt->purge_overlap_len);


fprintf(stderr, "Example: ./hifiasm -o NA12878.asm -t 32 NA12878.fq.gz\n");
fprintf(stderr, "See `man ./hifiasm.1' for detailed description of these command-line options.\n");
}
@@ -97,6 +105,10 @@ void init_opt(hifiasm_opt_t* asm_opt)
asm_opt->max_short_tip = 3;
asm_opt->min_cnt = 2;
asm_opt->mid_cnt = 5;
asm_opt->purge_level_primary = 2;
asm_opt->purge_level_trio = 0;
asm_opt->purge_simi_rate = 0.75;
asm_opt->purge_overlap_len = 1;
}

void destory_opt(hifiasm_opt_t* asm_opt)
@@ -259,6 +271,19 @@ int check_option(hifiasm_opt_t* asm_opt)
return 0;
}

if(asm_opt->purge_level_primary < 0 || asm_opt->purge_level_primary > 2)
{
fprintf(stderr, "[ERROR] the level of purge-dup should be [0, 2] (-l)\n");
return 0;
}

if(ha_opt_triobin(asm_opt) && ((asm_opt->purge_level_trio < 0 || asm_opt->purge_level_trio > 1)))
{
fprintf(stderr, "[ERROR] the level of purge-dup for trio should be [0, 1] (-l)\n");
return 0;
}



if(asm_opt->fn_bin_yak[0] != NULL && check_file(asm_opt->fn_bin_yak[0], "YAK1") == 0) return 0;
if(asm_opt->fn_bin_yak[1] != NULL && check_file(asm_opt->fn_bin_yak[1], "YAK2") == 0) return 0;
@@ -280,6 +305,10 @@ int check_option(hifiasm_opt_t* asm_opt)
// fprintf(stderr, "small removed unitig threshold: %d\n", asm_opt->max_short_tip);
// fprintf(stderr, "min_cnt: %d\n", asm_opt->min_cnt);
// fprintf(stderr, "mid_cnt: %d\n", asm_opt->mid_cnt);
// fprintf(stderr, "purge_level_primary: %d\n", asm_opt->purge_level_primary);
// fprintf(stderr, "purge_level_trio: %d\n", asm_opt->purge_level_trio);
// fprintf(stderr, "purge_simi_rate: %f\n", asm_opt->purge_simi_rate);
// fprintf(stderr, "purge_overlap_len: %d\n", asm_opt->purge_overlap_len);

return 1;
}
@@ -317,7 +346,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)

int c;

while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:", long_options)) >= 0) {
while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:l:s:O:", long_options)) >= 0) {
if (c == 'h')
{
Print_H(asm_opt);
@@ -356,7 +385,12 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
else if (c == 302) asm_opt->flag |= HA_F_WRITE_PAF;
else if (c == 303) asm_opt->flag |= HA_F_WRITE_EC;
else if (c == 304) asm_opt->flag |= HA_F_SKIP_TRIOBIN;
else if (c == 305) asm_opt->flag |= HA_F_PURGE_TRIO;
else if (c == 'l')
{ ///0: disable purge_dup; 1: purge containment; 2: purge overlap
asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg);
}
else if (c == 's') asm_opt->purge_simi_rate = atof(opt.arg);
else if (c == 'O') asm_opt->purge_overlap_len = atoll(opt.arg);
else if (c == ':')
{
fprintf(stderr, "[ERROR] missing option argument in \"%s\"\n", argv[opt.i - 1]);
9 changes: 7 additions & 2 deletions CommandLines.h
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

#include <pthread.h>

#define HA_VERSION "0.5-dirty-r246"
#define HA_VERSION "0.5-dirty-r247"

#define VERBOSE 0

@@ -13,7 +13,8 @@
#define HA_F_WRITE_EC 0x8
#define HA_F_WRITE_PAF 0x10
#define HA_F_SKIP_TRIOBIN 0x20
#define HA_F_PURGE_TRIO 0x40
#define HA_F_PURGE_CONTAIN 0x40
#define HA_F_PURGE_JOIN 0x80

typedef struct {
int flag;
@@ -45,10 +46,14 @@ typedef struct {
int max_short_tip;
int min_cnt;
int mid_cnt;
int purge_level_primary;
int purge_level_trio;
int purge_overlap_len;

float max_hang_rate;
float min_drop_rate;
float max_drop_rate;
float purge_simi_rate;

long long small_pop_bubble_size;
long long large_pop_bubble_size;
52 changes: 29 additions & 23 deletions Overlaps.cpp
Original file line number Diff line number Diff line change
@@ -13223,9 +13223,11 @@ kvec_asg_arc_t_warp* new_rtg_edges)

delete_useless_nodes(ug);

if (asm_opt.flag & HA_F_PURGE_TRIO) {
purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges, 0.75, 50, 50, 0.5, max_hang,
min_ovlp, bubble_dist, drop_ratio, 1);
if(asm_opt.purge_level_trio == 1)
{
purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges,
asm_opt.purge_simi_rate, asm_opt.purge_overlap_len, max_hang, min_ovlp, bubble_dist,
drop_ratio, 1);
delete_useless_nodes(ug);
}

@@ -21975,7 +21977,7 @@ R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ov
kvec_asg_arc_t_warp* new_rtg_edges)
{
asg_t* nsg = (*ug)->g;
uint32_t v, n_vtx = nsg->n_seq, k, rId;
uint32_t v, n_vtx = nsg->n_seq, k, rId, just_contain;
ma_utg_t* u = NULL;


@@ -22001,20 +22003,22 @@ kvec_asg_arc_t_warp* new_rtg_edges)
tip_drop_ratio, stops_threshold, ruIndex, NULL, NULL, 0, 0, 0,
chimeric_rate, 0, 0, drop_ratio);

///delete_useless_nodes(ug);
delete_useless_nodes(ug);

///deduplicate_advance(*ug, read_g, coverage_cut, sources, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, 1);

delete_useless_nodes(ug);

renew_utg(ug, read_g, new_rtg_edges);

if(asm_opt.purge_level_primary > 0)
{
just_contain = 0;
if(asm_opt.purge_level_primary == 1) just_contain = 1;

///enable_debug_mode(1);
purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges, 0.75, 50, 50, 0.5, max_hang,
min_ovlp, bubble_dist, drop_ratio, 0);
delete_useless_nodes(ug);
renew_utg(ug, read_g, new_rtg_edges);
purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges,
asm_opt.purge_simi_rate, asm_opt.purge_overlap_len, max_hang, min_ovlp, bubble_dist,
drop_ratio, just_contain);
delete_useless_nodes(ug);
renew_utg(ug, read_g, new_rtg_edges);
}



rescue_missing_overlaps_aggressive(*ug, read_g, sources, coverage_cut, ruIndex, max_hang,
@@ -22026,16 +22030,18 @@ kvec_asg_arc_t_warp* new_rtg_edges)
renew_utg(ug, read_g, new_rtg_edges);


///debug_purge_dup = 1;
///deduplicate_advance(*ug, read_g, coverage_cut, sources, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, 0);
///delete_useless_nodes(ug);
///enable_debug_mode(0);
purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges, 0.75, 50, 50, 0.5, max_hang,
min_ovlp, bubble_dist, drop_ratio, 0);
delete_useless_nodes(ug);
renew_utg(ug, read_g, new_rtg_edges);

if(asm_opt.purge_level_primary > 0)
{
just_contain = 0;
if(asm_opt.purge_level_primary == 1) just_contain = 1;

purge_dups(*ug, read_g, coverage_cut, reverse_sources, ruIndex, new_rtg_edges,
asm_opt.purge_simi_rate, asm_opt.purge_overlap_len, max_hang, min_ovlp, bubble_dist,
drop_ratio, just_contain);
delete_useless_nodes(ug);
renew_utg(ug, read_g, new_rtg_edges);
}


n_vtx = read_g->n_seq;
for (v = 0; v < n_vtx; v++)
27 changes: 24 additions & 3 deletions Purge_Dups.cpp
Original file line number Diff line number Diff line change
@@ -2777,6 +2777,25 @@ ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* reverse_sources, R_to_U* ruIndex)
}
}

void filter_hap_overlaps_by_length(hap_overlaps_list* all_ovlp, uint32_t minLen)
{
hap_overlaps *x = NULL;
uint32_t v, i, m, uId;

for (v = 0; v < all_ovlp->num; v++)
{
uId = v;
m = 0;
for (i = 0; i < all_ovlp->x[uId].a.n; i++)
{
x = &(all_ovlp->x[uId].a.a[i]);
if(x->x_end_id - x->x_beg_id < minLen) continue;
all_ovlp->x[uId].a.a[m] = (*x);
m++;
}
all_ovlp->x[uId].a.n = m;
}
}

void debug_hap_overlaps(hap_overlaps_list* all_ovlp, hap_overlaps_list* back_all_ovlp)
{
@@ -3569,9 +3588,8 @@ void print_all_purge_ovlp(ma_ug_t *ug, hap_overlaps_list* all_ovlp)

}
void purge_dups(ma_ug_t *ug, asg_t *read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* reverse_sources,
R_to_U* ruIndex, kvec_asg_arc_t_warp* edge, float density, uint32_t bi_graph_Len, uint32_t long_hap_overlap,
float lable_match_rate, int max_hang, int min_ovlp, long long bubble_dist, float drop_ratio,
uint32_t just_contain)
R_to_U* ruIndex, kvec_asg_arc_t_warp* edge, float density, uint32_t purege_minLen, int max_hang,
int min_ovlp, long long bubble_dist, float drop_ratio, uint32_t just_contain)
{
asg_t *purge_g = NULL;
purge_g = asg_init();
@@ -3651,6 +3669,9 @@ uint32_t just_contain)
// 0.05, &all_ovlp);
// }


filter_hap_overlaps_by_length(&all_ovlp, purege_minLen);

///normalize_hap_overlaps(&all_ovlp, &back_all_ovlp);
normalize_hap_overlaps_advance(&all_ovlp, &back_all_ovlp, ug, read_g, reverse_sources, ruIndex);
///debug_hap_overlaps(&all_ovlp, &back_all_ovlp);
5 changes: 2 additions & 3 deletions Purge_Dups.h
Original file line number Diff line number Diff line change
@@ -8,9 +8,8 @@
#include "Hash_Table.h"

void purge_dups(ma_ug_t *ug, asg_t *read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* reverse_sources,
R_to_U* ruIndex, kvec_asg_arc_t_warp* edge, float density, uint32_t bi_graph_Len, uint32_t long_hap_overlap,
float lable_match_rate, int max_hang, int min_ovlp, long long bubble_dist, float drop_ratio,
uint32_t just_contain);
R_to_U* ruIndex, kvec_asg_arc_t_warp* edge, float density, uint32_t purege_minLen, int max_hang,
int min_ovlp, long long bubble_dist, float drop_ratio, uint32_t just_contain);
void fill_unitig(uint64_t* buffer, uint32_t bufferLen, asg_t* read_g, kvec_asg_arc_t_warp* edge,
uint32_t is_circle, uint64_t* rLen);

18 changes: 17 additions & 1 deletion hifiasm.1
Original file line number Diff line number Diff line change
@@ -236,11 +236,27 @@ but occurs <
times in the other sample.


.SS Purge-dups options

.TP 10
.BI -l \ INT
Level of purge-dup. 0 to disable purge-dup, 1 to only purge contained haplotigs,
2 to purge all types of haplotigs. In default, [2] for non-trio assembly, [0] for trio assembly.
For trio assembly, only level 0 and level 1 are allowed.

.TP
.BI -s \ FLOAT
Similarity threshold for duplicate haplotigs that should be purged [0.75].

.TP
.BI -O \ FLOAT
Min number of overlapped reads for duplicate haplotigs that should be purged [1].

.SS Debugging options

.TP 10
.B --dbg-gfa
Write additional files to speed up the debugging of graph cleaning
Write additional files to speed up the debugging of graph cleaning.


.SH OUTPUTS

0 comments on commit 7f6725e

Please sign in to comment.