forked from Parsoa/SVDSS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
31 changed files
with
3,553 additions
and
3,314 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,57 @@ | ||
#include "assembler.hpp" | ||
|
||
void Assembler::run() { | ||
auto c = Configuration::getInstance(); | ||
int num_batches = c->aggregate_batches; | ||
int tau = -1; // c->cutoff; | ||
auto c = Configuration::getInstance(); | ||
int num_batches = c->aggregate_batches; | ||
int tau = -1; // c->cutoff; | ||
|
||
lprint({"Assembling high-abundance strings from", to_string(num_batches), "batches.."}); | ||
#pragma omp parallel for num_threads(c->threads) | ||
for (int j = 0; j < num_batches; j++) { | ||
lprint({"Loading batch", to_string(j) + ".."}) ; | ||
string s_j = std::to_string(j); | ||
string inpath = c->workdir + "/solution_batch_" + s_j + ".sfs"; | ||
string outpath = c->workdir + "/solution_batch_" + s_j + ".assembled.sfs"; | ||
ofstream outf(outpath); | ||
map<string, vector<SFS>> SFSs = parse_sfsfile(inpath, tau); | ||
//cout << SFSs.size() << "SFS in total." << endl ; | ||
for (map<string, vector<SFS>>::iterator it = SFSs.begin(); it != SFSs.end(); ++it) { | ||
string ridx = it->first; | ||
vector<SFS> sfs = it->second; | ||
vector<SFS> assembled_sfs = assemble(sfs); | ||
bool is_first = true; | ||
for (const SFS &sfs : assembled_sfs) { | ||
outf << (is_first ? ridx : "*") << "\t" | ||
<< "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c << endl; | ||
is_first = false; | ||
} | ||
} | ||
outf.close(); | ||
lprint({"Assembling high-abundance strings from", to_string(num_batches), | ||
"batches.."}); | ||
#pragma omp parallel for num_threads(c->threads) | ||
for (int j = 0; j < num_batches; j++) { | ||
lprint({"Loading batch", to_string(j) + ".."}); | ||
string s_j = std::to_string(j); | ||
string inpath = c->workdir + "/solution_batch_" + s_j + ".sfs"; | ||
string outpath = c->workdir + "/solution_batch_" + s_j + ".assembled.sfs"; | ||
ofstream outf(outpath); | ||
map<string, vector<SFS>> SFSs = parse_sfsfile(inpath, tau); | ||
// cout << SFSs.size() << "SFS in total." << endl ; | ||
for (map<string, vector<SFS>>::iterator it = SFSs.begin(); it != SFSs.end(); | ||
++it) { | ||
string ridx = it->first; | ||
vector<SFS> sfs = it->second; | ||
vector<SFS> assembled_sfs = assemble(sfs); | ||
bool is_first = true; | ||
for (const SFS &sfs : assembled_sfs) { | ||
outf << (is_first ? ridx : "*") << "\t" | ||
<< "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c << endl; | ||
is_first = false; | ||
} | ||
} | ||
outf.close(); | ||
} | ||
} | ||
|
||
vector<SFS> Assembler::assemble(vector<SFS> &sfs) { | ||
vector<SFS> assembled_sfs; | ||
sort(sfs.begin(), sfs.end()); | ||
int i = 0; | ||
while (i < sfs.size()) { | ||
uint j; | ||
for (j = i + 1; j < sfs.size(); ++j) { | ||
if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) { | ||
// non-overlapping | ||
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; | ||
assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); | ||
i = j; | ||
break; | ||
} | ||
} | ||
if (j == sfs.size()) { | ||
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; | ||
assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); | ||
i = j; | ||
} | ||
vector<SFS> assembled_sfs; | ||
sort(sfs.begin(), sfs.end()); | ||
int i = 0; | ||
while (i < sfs.size()) { | ||
uint j; | ||
for (j = i + 1; j < sfs.size(); ++j) { | ||
if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) { | ||
// non-overlapping | ||
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; | ||
assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); | ||
i = j; | ||
break; | ||
} | ||
} | ||
return assembled_sfs; | ||
if (j == sfs.size()) { | ||
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; | ||
assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); | ||
i = j; | ||
} | ||
} | ||
return assembled_sfs; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,134 +1,137 @@ | ||
#include "bam.hpp" | ||
|
||
using namespace std ; | ||
using namespace std; | ||
|
||
uint32_t cigar_len_mask = 0xFFFFFFF0 ; | ||
uint32_t cigar_type_mask = 0xF ; | ||
uint32_t cigar_len_mask = 0xFFFFFFF0; | ||
uint32_t cigar_type_mask = 0xF; | ||
|
||
string print_cigar_symbol(int type) { | ||
if (type == BAM_CMATCH) { | ||
return "M" ; | ||
} | ||
if (type == BAM_CINS) { | ||
return "I" ; | ||
} | ||
if (type == BAM_CDEL) { | ||
return "D" ; | ||
} | ||
if (type == BAM_CSOFT_CLIP) { | ||
return "S" ; | ||
} | ||
if (type == BAM_CHARD_CLIP) { | ||
return "H" ; | ||
} | ||
return "X" ; | ||
if (type == BAM_CMATCH) { | ||
return "M"; | ||
} | ||
if (type == BAM_CINS) { | ||
return "I"; | ||
} | ||
if (type == BAM_CDEL) { | ||
return "D"; | ||
} | ||
if (type == BAM_CSOFT_CLIP) { | ||
return "S"; | ||
} | ||
if (type == BAM_CHARD_CLIP) { | ||
return "H"; | ||
} | ||
return "X"; | ||
} | ||
|
||
vector<pair<uint32_t, uint32_t>> decode_cigar(bam1_t* read) { | ||
// get CIGAR | ||
vector<pair<uint32_t, uint32_t>> cigar_offsets ; | ||
uint32_t* cigar = bam_get_cigar(read) ; | ||
int offset = 0 ; | ||
for (int i = 0; i < read->core.n_cigar; i++) { | ||
uint32_t type = cigar[i] & cigar_type_mask ; | ||
uint32_t length = cigar[i] >> 4 ; | ||
cigar_offsets.push_back(make_pair(length, type)) ; | ||
} | ||
return cigar_offsets ; | ||
vector<pair<uint32_t, uint32_t>> decode_cigar(bam1_t *read) { | ||
// get CIGAR | ||
vector<pair<uint32_t, uint32_t>> cigar_offsets; | ||
uint32_t *cigar = bam_get_cigar(read); | ||
int offset = 0; | ||
for (int i = 0; i < read->core.n_cigar; i++) { | ||
uint32_t type = cigar[i] & cigar_type_mask; | ||
uint32_t length = cigar[i] >> 4; | ||
cigar_offsets.push_back(make_pair(length, type)); | ||
} | ||
return cigar_offsets; | ||
} | ||
|
||
uint8_t* encode_cigar(vector<pair<uint32_t, uint32_t>> cigar) { | ||
uint32_t* cigar_bytes = (uint32_t*) malloc(sizeof(uint32_t) * cigar.size()) ; | ||
for (int i = 0; i < cigar.size(); i++) { | ||
cigar_bytes[i] = (cigar[i].first << 4) | (cigar[i].second & cigar_type_mask) ; | ||
} | ||
return (uint8_t*) cigar_bytes ; | ||
uint8_t *encode_cigar(vector<pair<uint32_t, uint32_t>> cigar) { | ||
uint32_t *cigar_bytes = (uint32_t *)malloc(sizeof(uint32_t) * cigar.size()); | ||
for (int i = 0; i < cigar.size(); i++) { | ||
cigar_bytes[i] = | ||
(cigar[i].first << 4) | (cigar[i].second & cigar_type_mask); | ||
} | ||
return (uint8_t *)cigar_bytes; | ||
} | ||
|
||
uint8_t* encode_bam_seq(char* seq) { | ||
int n = (strlen(seq) + 1) >> 1 ; | ||
int l_seq = strlen(seq) ; | ||
uint8_t* seq_bytes = (uint8_t*) malloc(sizeof(uint8_t) * n) ; | ||
int i = 0 ; | ||
n = 0 ; | ||
for (i = 0; i + 1 < l_seq; i += 2) { | ||
seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; | ||
n += 1 ; | ||
} | ||
for (; i < l_seq; i++) { | ||
seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; | ||
n += 1 ; | ||
} | ||
return seq_bytes ; | ||
uint8_t *encode_bam_seq(char *seq) { | ||
int n = (strlen(seq) + 1) >> 1; | ||
int l_seq = strlen(seq); | ||
uint8_t *seq_bytes = (uint8_t *)malloc(sizeof(uint8_t) * n); | ||
int i = 0; | ||
n = 0; | ||
for (i = 0; i + 1 < l_seq; i += 2) { | ||
seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | | ||
seq_nt16_table[(unsigned char)seq[i + 1]]; | ||
n += 1; | ||
} | ||
for (; i < l_seq; i++) { | ||
seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; | ||
n += 1; | ||
} | ||
return seq_bytes; | ||
} | ||
|
||
char reverse_complement_base(char base) { | ||
if (base == 'C' || base == 'c') { | ||
return 'G' ; | ||
} | ||
if (base == 'A' || base == 'a') { | ||
return 'T' ; | ||
} | ||
if (base == 'G' || base == 'g') { | ||
return 'C' ; | ||
} | ||
if (base == 'T' || base == 't') { | ||
return 'A' ; | ||
} | ||
else { | ||
return 'N' ; | ||
} | ||
if (base == 'C' || base == 'c') { | ||
return 'G'; | ||
} | ||
if (base == 'A' || base == 'a') { | ||
return 'T'; | ||
} | ||
if (base == 'G' || base == 'g') { | ||
return 'C'; | ||
} | ||
if (base == 'T' || base == 't') { | ||
return 'A'; | ||
} else { | ||
return 'N'; | ||
} | ||
} | ||
|
||
void reverse_complement_read(char* seq) { | ||
int l = strlen(seq) ; | ||
int i = 0 ; | ||
while (i < l / 2) { | ||
auto t = reverse_complement_base(seq[l - i]) ; | ||
seq[l - 1 - i] = reverse_complement_base(seq[i]) ; | ||
seq[i] = t ; | ||
i += 1 ; | ||
} | ||
void reverse_complement_read(char *seq) { | ||
int l = strlen(seq); | ||
int i = 0; | ||
while (i < l / 2) { | ||
auto t = reverse_complement_base(seq[l - i]); | ||
seq[l - 1 - i] = reverse_complement_base(seq[i]); | ||
seq[i] = t; | ||
i += 1; | ||
} | ||
} | ||
|
||
vector<pair<int, int>> get_aligned_pairs(bam1_t *alignment) { | ||
vector<pair<int, int>> result ; | ||
uint ref_pos = alignment->core.pos ; | ||
uint read_pos = 0 ; | ||
auto cigar_offsets = decode_cigar(alignment) ; | ||
int m = 0 ; | ||
while (true) { | ||
if (m == cigar_offsets.size()) { | ||
break ; | ||
} | ||
if (cigar_offsets[m].second == BAM_CMATCH or cigar_offsets[m].second == BAM_CEQUAL or cigar_offsets[m].second == BAM_CDIFF) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(read_pos, i)); | ||
read_pos++; | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else if (cigar_offsets[m].second == BAM_CINS or cigar_offsets[m].second == BAM_CSOFT_CLIP) { | ||
for (uint i = 0; i < cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(read_pos, -1)); | ||
read_pos++; | ||
} | ||
} else if (cigar_offsets[m].second == BAM_CDEL) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(-1, i)); | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { | ||
// advances neither | ||
} else if (cigar_offsets[m].second == BAM_CREF_SKIP) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(-1, i)); | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else { //if (cigar_offsets[m].second == BAM_CPAD) { | ||
//TODO | ||
} | ||
m++ ; | ||
} | ||
return result; | ||
vector<pair<int, int>> result; | ||
uint ref_pos = alignment->core.pos; | ||
uint read_pos = 0; | ||
auto cigar_offsets = decode_cigar(alignment); | ||
int m = 0; | ||
while (true) { | ||
if (m == cigar_offsets.size()) { | ||
break; | ||
} | ||
if (cigar_offsets[m].second == BAM_CMATCH or | ||
cigar_offsets[m].second == BAM_CEQUAL or | ||
cigar_offsets[m].second == BAM_CDIFF) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(read_pos, i)); | ||
read_pos++; | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else if (cigar_offsets[m].second == BAM_CINS or | ||
cigar_offsets[m].second == BAM_CSOFT_CLIP) { | ||
for (uint i = 0; i < cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(read_pos, -1)); | ||
read_pos++; | ||
} | ||
} else if (cigar_offsets[m].second == BAM_CDEL) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(-1, i)); | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { | ||
// advances neither | ||
} else if (cigar_offsets[m].second == BAM_CREF_SKIP) { | ||
for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { | ||
result.push_back(make_pair(-1, i)); | ||
} | ||
ref_pos += cigar_offsets[m].first; | ||
} else { // if (cigar_offsets[m].second == BAM_CPAD) { | ||
// TODO | ||
} | ||
m++; | ||
} | ||
return result; | ||
} | ||
|
Oops, something went wrong.