diff --git a/assembler.cpp b/assembler.cpp index ff0f090..17b75f2 100644 --- a/assembler.cpp +++ b/assembler.cpp @@ -38,17 +38,17 @@ vector Assembler::assemble(vector &sfs) { while (i < sfs.size()) { size_t j; for (j = i + 1; j < sfs.size(); ++j) { - if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) { + if (sfs[j - 1].qs + sfs[j - 1].l <= sfs[j].qs) { // non-overlapping - uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; - assembled_sfs.push_back(SFS(sfs[i].s, l, sfs[i].htag)); + uint l = sfs[j - 1].qs + sfs[j - 1].l - sfs[i].qs; + assembled_sfs.push_back(SFS(sfs[i].qname, sfs[i].qs, l, sfs[i].htag)); i = j; break; } } if (j == sfs.size()) { - uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; - assembled_sfs.push_back(SFS(sfs[i].s, l, sfs[i].htag)); + uint l = sfs[j - 1].qs + sfs[j - 1].l - sfs[i].qs; + assembled_sfs.push_back(SFS(sfs[i].qname, sfs[i].qs, l, sfs[i].htag)); i = j; } } diff --git a/clusterer.cpp b/clusterer.cpp index 098f240..c45c0d3 100644 --- a/clusterer.cpp +++ b/clusterer.cpp @@ -168,10 +168,10 @@ void Clusterer::extend_alignment(bam1_t *aln, int index) { pair lclip = make_pair(0, 0); pair rclip = make_pair(0, 0); int last_pos = 0; - vector local_extended_sfs; + vector local_extended_sfs; for (const SFS &sfs : SFSs->at(qname)) { - int s = sfs.s; - int e = sfs.s + sfs.l - 1; + int s = sfs.qs; + int e = sfs.qs + sfs.l - 1; int hp_tag = sfs.htag; int aln_start = -1; int aln_end = -1; @@ -297,33 +297,33 @@ void Clusterer::extend_alignment(bam1_t *aln, int index) { } // FIXME: understand why this is happening (chr16 on full giab genome) if ((uint)prekmer.second > postkmer.second + config->ksize) { - spdlog::warn("Error on {}. SFS starting at {} (length {})", qname, sfs.s, + spdlog::warn("Error on {}. SFS starting at {} (length {})", qname, sfs.qs, sfs.l); } else { local_extended_sfs.push_back( - ExtSFS(string(chrom), string(qname), prekmer.second, - postkmer.second + config->ksize, prekmer.first, - postkmer.first + config->ksize, hp_tag)); + SFS(string(chrom), string(qname), prekmer.second, + postkmer.second + config->ksize, prekmer.first, + postkmer.first + config->ksize, hp_tag)); } } // When two SFSs are close but not overlapping, we may end up with two // overlapping extended SFSs. We need to merge SFSs two by two until we don't // need to merge anything - vector merged_extended_sfs; + vector merged_extended_sfs; for (size_t i = 0; i < local_extended_sfs.size(); ++i) { size_t j; for (j = 0; j < merged_extended_sfs.size(); ++j) { - if ((local_extended_sfs.at(i).s <= merged_extended_sfs.at(j).s && - merged_extended_sfs.at(j).s <= local_extended_sfs.at(i).e) || - (merged_extended_sfs.at(j).s <= local_extended_sfs.at(i).s && - local_extended_sfs.at(i).s <= merged_extended_sfs.at(j).e)) + if ((local_extended_sfs.at(i).rs <= merged_extended_sfs.at(j).rs && + merged_extended_sfs.at(j).rs <= local_extended_sfs.at(i).re) || + (merged_extended_sfs.at(j).rs <= local_extended_sfs.at(i).rs && + local_extended_sfs.at(i).rs <= merged_extended_sfs.at(j).re)) break; } if (j < merged_extended_sfs.size()) { - merged_extended_sfs[j].s = - min(merged_extended_sfs.at(j).s, local_extended_sfs.at(i).s); - merged_extended_sfs[j].e = - max(merged_extended_sfs.at(j).e, local_extended_sfs.at(i).e); + merged_extended_sfs[j].rs = + min(merged_extended_sfs.at(j).rs, local_extended_sfs.at(i).rs); + merged_extended_sfs[j].re = + max(merged_extended_sfs.at(j).re, local_extended_sfs.at(i).re); merged_extended_sfs[j].qs = min(merged_extended_sfs.at(j).qs, local_extended_sfs.at(i).qs); merged_extended_sfs[j].qe = @@ -405,16 +405,16 @@ Clusterer::get_unique_kmers(const vector> &alpairs, const uint k, void Clusterer::cluster_by_proximity() { sort(extended_SFSs.begin(), extended_SFSs.end()); auto r = max_element(extended_SFSs.begin(), extended_SFSs.end(), - [](const ExtSFS &lhs, const ExtSFS &rhs) { - return lhs.e - lhs.s < rhs.e - rhs.s; + [](const SFS &lhs, const SFS &rhs) { + return lhs.re - lhs.rs < rhs.re - rhs.rs; }); - int dist = (r->e - r->s) * 1.1; // TODO: add to CLI + int dist = (r->re - r->rs) * 1.1; // TODO: add to CLI spdlog::info( "Maximum extended SFS length: {}bp. Using separation distance: {}bp.", - r->e - r->s, dist); + r->re - r->rs, dist); // Cluster SFSs inside dist-bp windows int prev_i = 0; - int prev_e = extended_SFSs[0].e; + int prev_e = extended_SFSs[0].re; string prev_chrom = extended_SFSs[0].chrom; vector> intervals; for (size_t i = 1; i < extended_SFSs.size(); i++) { @@ -424,12 +424,12 @@ void Clusterer::cluster_by_proximity() { prev_chrom = sfs.chrom; intervals.push_back(make_pair(prev_i, i - 1)); prev_i = i; - prev_e = sfs.e; + prev_e = sfs.re; continue; } else { - if (sfs.s - prev_e > dist) { + if (sfs.rs - prev_e > dist) { intervals.push_back(make_pair(prev_i, i - 1)); - prev_e = sfs.e; + prev_e = sfs.re; prev_i = i; } } @@ -438,28 +438,28 @@ void Clusterer::cluster_by_proximity() { // Cluster SFS inside each interval _p_sfs_clusters.resize( - config->threads); // vector, vector>> + config->threads); // vector, vector>> #pragma omp parallel for num_threads(config->threads) schedule(static, 1) for (size_t i = 0; i < intervals.size(); i++) { int t = omp_get_thread_num(); int j = intervals[i].first; - int low = extended_SFSs[j].s; - int high = extended_SFSs[j].e; + int low = extended_SFSs[j].rs; + int high = extended_SFSs[j].re; int last_j = j; j++; for (; j <= intervals[i].second; j++) { - const ExtSFS &sfs = extended_SFSs[j]; - if (sfs.s <= high) { - low = min(low, sfs.s); - high = max(high, sfs.e); + const SFS &sfs = extended_SFSs[j]; + if (sfs.rs <= high) { + low = min(low, sfs.rs); + high = max(high, sfs.re); } else { for (int k = last_j; k < j; k++) { // CHECKME: < or <=? // NOTE: <= makes the code waaaay slower _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_SFSs[k]); } - low = sfs.s; - high = sfs.e; + low = sfs.rs; + high = sfs.re; last_j = j; } } @@ -499,9 +499,9 @@ void Clusterer::fill_clusters() { set reads; int min_s = numeric_limits::max(); int max_e = 0; - for (const ExtSFS &sfs : cluster.SFSs) { - min_s = min(min_s, sfs.s); - max_e = max(max_e, sfs.e); + for (const SFS &sfs : cluster.SFSs) { + min_s = min(min_s, sfs.rs); + max_e = max(max_e, sfs.re); reads.insert(sfs.qname); } diff --git a/clusterer.hpp b/clusterer.hpp index 4ff6d78..1382cf7 100644 --- a/clusterer.hpp +++ b/clusterer.hpp @@ -26,13 +26,13 @@ struct Cluster { int s; int e; int cov; - vector SFSs; + vector SFSs; vector names; vector seqs; Cluster(){}; - Cluster(const vector &_SFSs) { + Cluster(const vector &_SFSs) { SFSs = _SFSs; chrom = _SFSs[0].chrom; } @@ -78,7 +78,7 @@ class Clusterer { private: Configuration *config; unordered_map> *SFSs; - vector extended_SFSs; + vector extended_SFSs; samFile *bam_file; hts_idx_t *bam_index; bam_hdr_t *bam_header; @@ -107,9 +107,9 @@ class Clusterer { // parallelize vector> _p_clips; - vector> _p_extended_sfs; + vector> _p_extended_sfs; vector>> bam_entries; - vector, vector>> _p_sfs_clusters; + vector, vector>> _p_sfs_clusters; public: Clusterer(unordered_map> *); diff --git a/ping_pong.cpp b/ping_pong.cpp index a67449a..4756b57 100644 --- a/ping_pong.cpp +++ b/ping_pong.cpp @@ -13,8 +13,8 @@ void seq_char2nt6(int l, unsigned char *s) { } /* Compute SFS strings from P and store them into solutions*/ -void PingPong::ping_pong_search(rld_t *index, uint8_t *P, int l, - vector &solutions, int hp_tag) { +void PingPong::ping_pong_search(rld_t *index, const string &qname, uint8_t *P, + int l, vector &solutions, int hp_tag) { rldintv_t sai; int begin = l - 1; while (begin >= 0) { @@ -58,16 +58,14 @@ void PingPong::ping_pong_search(rld_t *index, uint8_t *P, int l, // fmatches: " << fmatches << endl ;) DEBUG(cerr << "Adding [" << begin << // ", " << end << "]." << endl ;) int sfs_len = end - begin + 1; - SFS sfs = SFS{begin, sfs_len, hp_tag}; + SFS sfs(qname, begin, sfs_len, hp_tag); solutions.push_back(sfs); - if (begin == 0) { + if (begin == 0) break; - } - if (config->overlap == 0) { // Relaxed + if (config->overlap == 0) // Relaxed begin -= 1; - } else { + else begin = end + config->overlap; // overlap < 0 - } } } @@ -191,7 +189,7 @@ batch_type_t PingPong::process_batch(rld_t *index, int p, int thread) { if (!bam_mode) { for (uint j = 0; j < read_seqs[p][thread].size(); j++) { ping_pong_search( - index, read_seqs[p][thread][j], + index, read_names[p][thread][j], read_seqs[p][thread][j], strlen( (char *)read_seqs[p][thread] [j]), // FIXME: this may be inefficient. We were @@ -212,7 +210,7 @@ batch_type_t PingPong::process_batch(rld_t *index, int p, int thread) { : 0; if (config->putative and xf_t != 0) continue; - ping_pong_search(index, read_seqs[p][thread][j], aln->core.l_qseq, + ping_pong_search(index, qname, read_seqs[p][thread][j], aln->core.l_qseq, solutions[qname], hp_t); } } @@ -235,7 +233,7 @@ void PingPong::output_batch(int b) { for (const SFS &sfs : assembled_SFSs) { // optimize file output size by not outputting read name for every // SFS - cout << (is_first ? read.first : "*") << "\t" << sfs.s << "\t" + cout << (is_first ? read.first : "*") << "\t" << sfs.qs << "\t" << sfs.l << "\t" << sfs.htag << "\t" << endl; is_first = false; } diff --git a/ping_pong.hpp b/ping_pong.hpp index 4c1f530..887b4c1 100644 --- a/ping_pong.hpp +++ b/ping_pong.hpp @@ -67,8 +67,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s) { return l; } -typedef SFS sfs_type_t; -typedef map> batch_type_t; +typedef map> batch_type_t; static const vector int2char({"$", "A", "C", "G", "T", "N"}); @@ -95,11 +94,11 @@ class PingPong { vector>> read_names; vector>> bam_entries; vector>> fastq_entries; - bool load_batch_bam(int p); - bool load_batch_fastq(int threads, int batch_size, int p); - batch_type_t process_batch(rld_t *index, int p, int i); - void ping_pong_search(rld_t *index, uint8_t *seq, int l, - vector &solutions, int hp); + bool load_batch_bam(int); + bool load_batch_fastq(int, int, int); + batch_type_t process_batch(rld_t *, int, int); + void ping_pong_search(rld_t *, const string &, uint8_t *, int, vector &, + int); void output_batch(int); vector> obatches; diff --git a/sfs.cpp b/sfs.cpp index 6e7b916..e185f6a 100644 --- a/sfs.cpp +++ b/sfs.cpp @@ -1,7 +1,5 @@ #include "sfs.hpp" -bool operator<(const SFS &x, const SFS &y) { return x.s < y.s; } - // TODO: if we split the .sfs into more files, we can load it using more threads // (but for SVs having a single file shouldn't be the bottleneck) unordered_map> parse_sfsfile(const string &sfs_path) { @@ -22,9 +20,8 @@ unordered_map> parse_sfsfile(const string &sfs_path) { read_name = info[0]; SFSs[read_name] = vector(); } - // TODO SFSs[read_name].push_back( - SFS(stoi(info[1]), stoi(info[2]), stoi(info[3]))); + SFS(read_name, stoi(info[1]), stoi(info[2]), stoi(info[3]))); ++total; } } diff --git a/sfs.hpp b/sfs.hpp index e6c0132..5c60b1b 100644 --- a/sfs.hpp +++ b/sfs.hpp @@ -29,56 +29,52 @@ static const char RCN[128] = { }; struct SFS { - int s; - int l; - int htag; - - SFS() { - s = 0; - l = 0; - htag = 0; - } - - SFS(int s_, int l_, int htag_) { - s = s_; - l = l_; - htag = htag_; - } - - // void reverse(int p) { s = p - s - l; } -}; - -bool operator<(const SFS &, const SFS &); - -struct ExtSFS { string chrom; string qname; - // reference positions - int s; - int e; - // query positions + // reference and query positions, length + int rs; + int re; int qs; int qe; + int l; // other int htag; - ExtSFS(const string &_chrom, const string &_qname, int _s, int _e, int _qs, - int _qe, int _htag) { + SFS(const string &_qname, int _qs, int _l, int _htag) { + chrom = ""; + qname = _qname; + qs = _qs; + qe = _qs + _l; + l = _l; + htag = _htag; + } + + SFS(const string &_chrom, const string &_qname, int _rs, int _re, int _qs, + int _qe, int _htag) { chrom = _chrom; qname = _qname; - s = _s; - e = _e; + rs = _rs; + re = _re; qs = _qs; qe = _qe; + l = qe - qs + 1; htag = _htag; } - bool operator<(const ExtSFS &c) const { - return chrom == c.chrom ? s < c.s : chrom < c.chrom; + // void reverse(int p) { s = p - s - l; } + + bool operator<(const SFS &c) const { + // FIXME: bad trick to manage both cases we can have (in assembler, we have + // to check for query positions and we are sure we do not have a chromosome + // there. On caller, where we have the chrom, for reference positions) + if (chrom == "" || c.chrom == "") + return qs < c.qs; + else + return chrom == c.chrom ? rs < c.rs : chrom < c.chrom; } - bool operator==(const ExtSFS &c) const { - return chrom == c.chrom and s == c.s and e == c.e; + bool operator==(const SFS &c) const { + return chrom == c.chrom and rs == c.rs and re == c.re; } }; diff --git a/sv.hpp b/sv.hpp index 1bdc14f..c0bfdee 100644 --- a/sv.hpp +++ b/sv.hpp @@ -29,10 +29,10 @@ class SV { string reads; SV(); - SV(const string type_, const string &chrom_, uint s_, - const string &refall_, const string &altall_, const uint w_, - const uint cov_, const int ngaps_, const int score_, - bool imprecise_ = false, uint l_ = 0, string cigar_ = "."); + SV(const string type_, const string &chrom_, uint s_, const string &refall_, + const string &altall_, const uint w_, const uint cov_, const int ngaps_, + const int score_, bool imprecise_ = false, uint l_ = 0, + string cigar_ = "."); void add_reads(const vector &reads_); void genotype();