diff --git a/assembler.cpp b/assembler.cpp index 63048ed..d3fe96b 100644 --- a/assembler.cpp +++ b/assembler.cpp @@ -1,55 +1,57 @@ #include "assembler.hpp" void Assembler::run() { - auto c = Configuration::getInstance(); - int num_batches = c->aggregate_batches; - int tau = -1; // c->cutoff; + auto c = Configuration::getInstance(); + int num_batches = c->aggregate_batches; + int tau = -1; // c->cutoff; - lprint({"Assembling high-abundance strings from", to_string(num_batches), "batches.."}); - #pragma omp parallel for num_threads(c->threads) - for (int j = 0; j < num_batches; j++) { - lprint({"Loading batch", to_string(j) + ".."}) ; - string s_j = std::to_string(j); - string inpath = c->workdir + "/solution_batch_" + s_j + ".sfs"; - string outpath = c->workdir + "/solution_batch_" + s_j + ".assembled.sfs"; - ofstream outf(outpath); - map> SFSs = parse_sfsfile(inpath, tau); - //cout << SFSs.size() << "SFS in total." << endl ; - for (map>::iterator it = SFSs.begin(); it != SFSs.end(); ++it) { - string ridx = it->first; - vector sfs = it->second; - vector assembled_sfs = assemble(sfs); - bool is_first = true; - for (const SFS &sfs : assembled_sfs) { - outf << (is_first ? ridx : "*") << "\t" - << "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c << endl; - is_first = false; - } - } - outf.close(); + lprint({"Assembling high-abundance strings from", to_string(num_batches), + "batches.."}); +#pragma omp parallel for num_threads(c->threads) + for (int j = 0; j < num_batches; j++) { + lprint({"Loading batch", to_string(j) + ".."}); + string s_j = std::to_string(j); + string inpath = c->workdir + "/solution_batch_" + s_j + ".sfs"; + string outpath = c->workdir + "/solution_batch_" + s_j + ".assembled.sfs"; + ofstream outf(outpath); + map> SFSs = parse_sfsfile(inpath, tau); + // cout << SFSs.size() << "SFS in total." << endl ; + for (map>::iterator it = SFSs.begin(); it != SFSs.end(); + ++it) { + string ridx = it->first; + vector sfs = it->second; + vector assembled_sfs = assemble(sfs); + bool is_first = true; + for (const SFS &sfs : assembled_sfs) { + outf << (is_first ? 
ridx : "*") << "\t" + << "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c << endl; + is_first = false; + } } + outf.close(); + } } vector Assembler::assemble(vector &sfs) { - vector assembled_sfs; - sort(sfs.begin(), sfs.end()); - int i = 0; - while (i < sfs.size()) { - uint j; - for (j = i + 1; j < sfs.size(); ++j) { - if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) { - // non-overlapping - uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; - assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); - i = j; - break; - } - } - if (j == sfs.size()) { - uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; - assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); - i = j; - } + vector assembled_sfs; + sort(sfs.begin(), sfs.end()); + int i = 0; + while (i < sfs.size()) { + uint j; + for (j = i + 1; j < sfs.size(); ++j) { + if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) { + // non-overlapping + uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; + assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); + i = j; + break; + } } - return assembled_sfs; + if (j == sfs.size()) { + uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s; + assembled_sfs.push_back(SFS(sfs[i].s, l, 1, sfs[i].isreversed)); + i = j; + } + } + return assembled_sfs; } diff --git a/assembler.hpp b/assembler.hpp index 65ef378..30a335f 100644 --- a/assembler.hpp +++ b/assembler.hpp @@ -1,11 +1,11 @@ #ifndef ASSEMBLER_HPP #define ASSEMBLER_HPP -#include #include +#include -#include "sfs.hpp" #include "config.hpp" +#include "sfs.hpp" using namespace std; diff --git a/bam.cpp b/bam.cpp index e044566..7747cfb 100644 --- a/bam.cpp +++ b/bam.cpp @@ -1,134 +1,137 @@ #include "bam.hpp" -using namespace std ; +using namespace std; -uint32_t cigar_len_mask = 0xFFFFFFF0 ; -uint32_t cigar_type_mask = 0xF ; +uint32_t cigar_len_mask = 0xFFFFFFF0; +uint32_t cigar_type_mask = 0xF; string print_cigar_symbol(int type) { - if (type == BAM_CMATCH) { - return "M" ; - } - if (type == BAM_CINS) { - return "I" ; - } - if (type == BAM_CDEL) { - return "D" ; - } - if (type == BAM_CSOFT_CLIP) { - return "S" ; - } - if (type == BAM_CHARD_CLIP) { - return "H" ; - } - return "X" ; + if (type == BAM_CMATCH) { + return "M"; + } + if (type == BAM_CINS) { + return "I"; + } + if (type == BAM_CDEL) { + return "D"; + } + if (type == BAM_CSOFT_CLIP) { + return "S"; + } + if (type == BAM_CHARD_CLIP) { + return "H"; + } + return "X"; } -vector> decode_cigar(bam1_t* read) { - // get CIGAR - vector> cigar_offsets ; - uint32_t* cigar = bam_get_cigar(read) ; - int offset = 0 ; - for (int i = 0; i < read->core.n_cigar; i++) { - uint32_t type = cigar[i] & cigar_type_mask ; - uint32_t length = cigar[i] >> 4 ; - cigar_offsets.push_back(make_pair(length, type)) ; - } - return cigar_offsets ; +vector> decode_cigar(bam1_t *read) { + // get CIGAR + vector> cigar_offsets; + uint32_t *cigar = bam_get_cigar(read); + int offset = 0; + for (int i = 0; i < read->core.n_cigar; i++) { + uint32_t type = cigar[i] & cigar_type_mask; + uint32_t length = cigar[i] >> 4; + cigar_offsets.push_back(make_pair(length, type)); + } + return cigar_offsets; } -uint8_t* encode_cigar(vector> cigar) { - uint32_t* cigar_bytes = (uint32_t*) malloc(sizeof(uint32_t) * cigar.size()) ; - for (int i = 0; i < cigar.size(); i++) { - cigar_bytes[i] = (cigar[i].first << 4) | (cigar[i].second & cigar_type_mask) ; - } - return (uint8_t*) cigar_bytes ; +uint8_t *encode_cigar(vector> cigar) { + uint32_t *cigar_bytes = (uint32_t *)malloc(sizeof(uint32_t) * cigar.size()); + for (int i = 0; i < cigar.size(); i++) { + 
cigar_bytes[i] = + (cigar[i].first << 4) | (cigar[i].second & cigar_type_mask); + } + return (uint8_t *)cigar_bytes; } -uint8_t* encode_bam_seq(char* seq) { - int n = (strlen(seq) + 1) >> 1 ; - int l_seq = strlen(seq) ; - uint8_t* seq_bytes = (uint8_t*) malloc(sizeof(uint8_t) * n) ; - int i = 0 ; - n = 0 ; - for (i = 0; i + 1 < l_seq; i += 2) { - seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; - n += 1 ; - } - for (; i < l_seq; i++) { - seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; - n += 1 ; - } - return seq_bytes ; +uint8_t *encode_bam_seq(char *seq) { + int n = (strlen(seq) + 1) >> 1; + int l_seq = strlen(seq); + uint8_t *seq_bytes = (uint8_t *)malloc(sizeof(uint8_t) * n); + int i = 0; + n = 0; + for (i = 0; i + 1 < l_seq; i += 2) { + seq_bytes[n] = (seq_nt16_table[(unsigned char)seq[i]] << 4) | + seq_nt16_table[(unsigned char)seq[i + 1]]; + n += 1; + } + for (; i < l_seq; i++) { + seq_bytes[n] = seq_nt16_table[(unsigned char)seq[i]] << 4; + n += 1; + } + return seq_bytes; } char reverse_complement_base(char base) { - if (base == 'C' || base == 'c') { - return 'G' ; - } - if (base == 'A' || base == 'a') { - return 'T' ; - } - if (base == 'G' || base == 'g') { - return 'C' ; - } - if (base == 'T' || base == 't') { - return 'A' ; - } - else { - return 'N' ; - } + if (base == 'C' || base == 'c') { + return 'G'; + } + if (base == 'A' || base == 'a') { + return 'T'; + } + if (base == 'G' || base == 'g') { + return 'C'; + } + if (base == 'T' || base == 't') { + return 'A'; + } else { + return 'N'; + } } -void reverse_complement_read(char* seq) { - int l = strlen(seq) ; - int i = 0 ; - while (i < l / 2) { - auto t = reverse_complement_base(seq[l - i]) ; - seq[l - 1 - i] = reverse_complement_base(seq[i]) ; - seq[i] = t ; - i += 1 ; - } +void reverse_complement_read(char *seq) { + int l = strlen(seq); + int i = 0; + while (i < l / 2) { + auto t = reverse_complement_base(seq[l - i]); + seq[l - 1 - i] = reverse_complement_base(seq[i]); + seq[i] = t; + i += 1; + } } vector> get_aligned_pairs(bam1_t *alignment) { - vector> result ; - uint ref_pos = alignment->core.pos ; - uint read_pos = 0 ; - auto cigar_offsets = decode_cigar(alignment) ; - int m = 0 ; - while (true) { - if (m == cigar_offsets.size()) { - break ; - } - if (cigar_offsets[m].second == BAM_CMATCH or cigar_offsets[m].second == BAM_CEQUAL or cigar_offsets[m].second == BAM_CDIFF) { - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { - result.push_back(make_pair(read_pos, i)); - read_pos++; - } - ref_pos += cigar_offsets[m].first; - } else if (cigar_offsets[m].second == BAM_CINS or cigar_offsets[m].second == BAM_CSOFT_CLIP) { - for (uint i = 0; i < cigar_offsets[m].first; ++i) { - result.push_back(make_pair(read_pos, -1)); - read_pos++; - } - } else if (cigar_offsets[m].second == BAM_CDEL) { - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { - result.push_back(make_pair(-1, i)); - } - ref_pos += cigar_offsets[m].first; - } else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { - // advances neither - } else if (cigar_offsets[m].second == BAM_CREF_SKIP) { - for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { - result.push_back(make_pair(-1, i)); - } - ref_pos += cigar_offsets[m].first; - } else { //if (cigar_offsets[m].second == BAM_CPAD) { - //TODO - } - m++ ; - } - return result; + vector> result; + uint ref_pos = alignment->core.pos; + uint read_pos = 0; + auto cigar_offsets = decode_cigar(alignment); + int m = 0; + 
while (true) { + if (m == cigar_offsets.size()) { + break; + } + if (cigar_offsets[m].second == BAM_CMATCH or + cigar_offsets[m].second == BAM_CEQUAL or + cigar_offsets[m].second == BAM_CDIFF) { + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { + result.push_back(make_pair(read_pos, i)); + read_pos++; + } + ref_pos += cigar_offsets[m].first; + } else if (cigar_offsets[m].second == BAM_CINS or + cigar_offsets[m].second == BAM_CSOFT_CLIP) { + for (uint i = 0; i < cigar_offsets[m].first; ++i) { + result.push_back(make_pair(read_pos, -1)); + read_pos++; + } + } else if (cigar_offsets[m].second == BAM_CDEL) { + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { + result.push_back(make_pair(-1, i)); + } + ref_pos += cigar_offsets[m].first; + } else if (cigar_offsets[m].second == BAM_CHARD_CLIP) { + // advances neither + } else if (cigar_offsets[m].second == BAM_CREF_SKIP) { + for (uint i = ref_pos; i < ref_pos + cigar_offsets[m].first; ++i) { + result.push_back(make_pair(-1, i)); + } + ref_pos += cigar_offsets[m].first; + } else { // if (cigar_offsets[m].second == BAM_CPAD) { + // TODO + } + m++; + } + return result; } - diff --git a/bam.hpp b/bam.hpp index 9f59078..e859f36 100644 --- a/bam.hpp +++ b/bam.hpp @@ -1,140 +1,140 @@ #ifndef BAM_HPP #define BAM_HPP -#include -#include -#include #include -#include -#include #include +#include +#include +#include #include +#include +#include -#include -#include #include #include +#include #include +#include -extern uint32_t cigar_len_mask ; -extern uint32_t cigar_type_mask ; +extern uint32_t cigar_len_mask; +extern uint32_t cigar_type_mask; -// CHECKME: I changed this struct but I didn't check if I broke something in realignment +// CHECKME: I changed this struct but I didn't check if I broke something in +// realignment struct CIGAR { - std::vector> ops; - int mismatches; - uint ngaps ; - uint start ; - int score ; - - CIGAR() { - mismatches = 0 ; - ngaps = -1 ; - score = -1 ; + std::vector> ops; + int mismatches; + uint ngaps; + uint start; + int score; + + CIGAR() { + mismatches = 0; + ngaps = -1; + score = -1; + } + + CIGAR(std::vector> ops_, int score_, uint start_ = 0) { + mismatches = -1; + score = score_; + ops = ops_; + ngaps = 0; + start = start_; + for (uint i = 0; i < ops_.size(); ++i) { + ngaps += ((ops_[i].second == 'I' || ops_[i].second == 'D') ? 1 : 0); } - - CIGAR(std::vector> ops_, int score_, uint start_ = 0) { - mismatches = -1 ; - score = score_ ; - ops = ops_ ; - ngaps = 0 ; - start = start_ ; - for (uint i = 0; i < ops_.size(); ++i) { - ngaps += ((ops_[i].second == 'I' || ops_[i].second == 'D') ? 1 : 0) ; - } + } + + void parse_cigar(char *cigar) { + int b = 0; + for (int i = 0; i < strlen(cigar); i++) { + if (isdigit(cigar[i])) { + continue; + } else { + char type = cigar[i]; + cigar[i] = '\0'; + int l = std::stoi(std::string(cigar + b)); + ops.push_back(std::make_pair(l, type)); + cigar[i] = type; + b = i + 1; + } } - - void parse_cigar(char* cigar) { - int b = 0 ; - for (int i = 0; i < strlen(cigar); i++) { - if (isdigit(cigar[i])) { - continue ; - } else { - char type = cigar[i] ; - cigar[i] = '\0' ; - int l = std::stoi(std::string(cigar + b)) ; - ops.push_back(std::make_pair(l, type)) ; - cigar[i] = type ; - b = i + 1 ; - } - } + } + + CIGAR(char *cigar, int score_, int start_ = 0) { + mismatches = -1; + score = score_; + start = start_; + parse_cigar(cigar); + ngaps = 0; + for (uint i = 0; i < ops.size(); ++i) { + ngaps += ((ops[i].second == 'I' || ops[i].second == 'D') ? 
1 : 0); } - - CIGAR(char* cigar, int score_, int start_ = 0) { - mismatches = -1 ; - score = score_ ; - start = start_ ; - parse_cigar(cigar) ; - ngaps = 0 ; - for (uint i = 0; i < ops.size(); ++i) { - ngaps += ((ops[i].second == 'I' || ops[i].second == 'D') ? 1 : 0); - } + } + + void add(int l, char op, int e) { + mismatches += e; + if (ops.empty() || ops.back().second != op) { + ops.push_back(std::make_pair(l, op)); + } else { + ops.back().first += l; } + } - void add(int l, char op, int e) { - mismatches += e; - if (ops.empty() || ops.back().second != op) { - ops.push_back(std::make_pair(l, op)); - } else { - ops.back().first += l; - } - } + void add_front(int l) { + ops.front().first += l; + ops.insert(ops.begin(), std::make_pair(1, 'M')); + } - void add_front(int l) { - ops.front().first += l; - ops.insert(ops.begin(), std::make_pair(1, 'M')); + void print() { + for (uint i = 0; i < ops.size(); ++i) { + std::cout << ops[i].first << ops[i].second; } - - void print() { - for (uint i = 0; i < ops.size(); ++i) { - std::cout << ops[i].first << ops[i].second ; - } - std::cout << std::endl ; + std::cout << std::endl; + } + + void fixclips() { + if (ops.front().second != 'M') { + if (ops.front().second == 'I') { + ops.front().second = 'S'; + } + if (ops.front().second == 'D') { + ops.erase(ops.begin()); // CHECKME: by doing this we may reduce the + // length of merged variations + --ngaps; + } + } else if (ops.back().second != 'M') { + if (ops.back().second == 'I') { + ops.back().second = 'S'; + } + if (ops.back().second == 'D') { + ops.pop_back(); // CHECKME: by doing this we may reduce the length of + // merged variations + --ngaps; + } } + } - void fixclips() { - if (ops.front().second != 'M') { - if (ops.front().second == 'I') { - ops.front().second = 'S'; - } - if (ops.front().second == 'D') { - ops.erase(ops.begin()) ; // CHECKME: by doing this we may reduce the length of merged variations - --ngaps ; - } - } - else if (ops.back().second != 'M') { - if (ops.back().second == 'I') { - ops.back().second = 'S'; - } - if (ops.back().second == 'D') { - ops.pop_back(); // CHECKME: by doing this we may reduce the length of merged variations - --ngaps ; - } - } - } + const std::pair &operator[](std::size_t i) const { + return ops[i]; + } - const std::pair &operator[](std::size_t i) const { - return ops[i]; - } - - uint size() const { - return ops.size(); - } + uint size() const { return ops.size(); } - std::string to_str() const { - std::string cigar_str; - for (const auto &op : ops) { - cigar_str += std::to_string(op.first) + op.second; - } - return cigar_str; + std::string to_str() const { + std::string cigar_str; + for (const auto &op : ops) { + cigar_str += std::to_string(op.first) + op.second; } + return cigar_str; + } }; -std::string print_cigar_symbol(int type) ; -std::vector> decode_cigar(bam1_t* read) ; -uint8_t* encode_cigar(std::vector> cigar) ; -uint8_t* encode_bam_seq(char* seq) ; -char reverse_complement_base(char base) ; -std::vector> get_aligned_pairs(bam1_t *alignment) ; +std::string print_cigar_symbol(int type); +std::vector> decode_cigar(bam1_t *read); +uint8_t *encode_cigar(std::vector> cigar); +uint8_t *encode_bam_seq(char *seq); +char reverse_complement_base(char base); +std::vector> get_aligned_pairs(bam1_t *alignment); #endif diff --git a/bed_utils.cpp b/bed_utils.cpp index 1a8d5e3..dad4da6 100644 --- a/bed_utils.cpp +++ b/bed_utils.cpp @@ -1,56 +1,60 @@ #include "bed_utils.hpp" bool track_comparator(Track a, Track b) { - if (a.chrom[3] == b.chrom[3]) { - return a.begin < 
b.end ; - } else { - return a.chrom[3] < b.chrom[3] ; - } + if (a.chrom[3] == b.chrom[3]) { + return a.begin < b.end; + } else { + return a.chrom[3] < b.chrom[3]; + } } string Track::get_name() const { - return svtype + "@" + chrom + "_" + std::to_string(begin) + "_" + std::to_string(end) ; + return svtype + "@" + chrom + "_" + std::to_string(begin) + "_" + + std::to_string(end); } std::vector load_tracks_from_file(string path) { lprint({"Parsing BED file:", path, ".."}); - std::ifstream bed_file(path) ; - std::string line ; - int i = 0 ; - std::vector tracks ; - unordered_map header ; - while (std::getline(bed_file, line)) { - istringstream iss(line) ; - if (i == 0) { - if (line[0] != '#') { - lprint({"BED header not present (first line doesn't begin with #). Aborting.."}, 2); - } - vector tokens{istream_iterator{iss}, istream_iterator{}} ; - for (int i = 0; i < tokens.size(); i++) { - header[tokens[i]] = i ; - } - i += 1 ; - } else { - vector tokens{istream_iterator{iss}, istream_iterator{}} ; - Track track ; - track.chrom = tokens[0] ; - track.begin = std::stoi(tokens[1]) ; - track.end = std::stoi(tokens[2]) ; - track.svtype = tokens[3] ; - track.svlen = std::stoi(tokens[4]) ; - tracks.push_back(track) ; - } + std::ifstream bed_file(path); + std::string line; + int i = 0; + std::vector tracks; + unordered_map header; + while (std::getline(bed_file, line)) { + istringstream iss(line); + if (i == 0) { + if (line[0] != '#') { + lprint({"BED header not present (first line doesn't begin with #). " + "Aborting.."}, + 2); + } + vector tokens{istream_iterator{iss}, + istream_iterator{}}; + for (int i = 0; i < tokens.size(); i++) { + header[tokens[i]] = i; + } + i += 1; + } else { + vector tokens{istream_iterator{iss}, + istream_iterator{}}; + Track track; + track.chrom = tokens[0]; + track.begin = std::stoi(tokens[1]); + track.end = std::stoi(tokens[2]); + track.svtype = tokens[3]; + track.svlen = std::stoi(tokens[4]); + tracks.push_back(track); } - lprint({"Loaded", to_string(tracks.size()), "tracks."}); - return tracks ; + } + lprint({"Loaded", to_string(tracks.size()), "tracks."}); + return tracks; } std::unordered_map load_tracks_from_file_as_dict(string path) { - auto _tracks = load_tracks_from_file(path) ; - std::unordered_map tracks ; - for (auto track = _tracks.begin(); track != _tracks.end(); track++) { - tracks[*track] = 1 ; - } - return tracks ; + auto _tracks = load_tracks_from_file(path); + std::unordered_map tracks; + for (auto track = _tracks.begin(); track != _tracks.end(); track++) { + tracks[*track] = 1; + } + return tracks; } - diff --git a/bed_utils.hpp b/bed_utils.hpp index cd3bb4e..a5dd18f 100644 --- a/bed_utils.hpp +++ b/bed_utils.hpp @@ -1,59 +1,58 @@ #ifndef BED_HPP #define BED_HPP -#include -#include +#include +#include #include -#include #include #include +#include #include -#include -#include +#include #include +#include #include "lprint.hpp" -using namespace std ; +using namespace std; struct Locus { - std::string chrom ; - int position ; - int count ; -} ; + std::string chrom; + int position; + int count; +}; struct Track { - std::string chrom ; - uint64_t begin ; - uint64_t end ; - std::string svtype ; - int svlen ; + std::string chrom; + uint64_t begin; + uint64_t end; + std::string svtype; + int svlen; - bool operator==(const Track &o) const { - return chrom == o.chrom && begin == o.begin && end == o.end && svtype == o.svtype ; - } + bool operator==(const Track &o) const { + return chrom == o.chrom && begin == o.begin && end == o.end && + svtype == o.svtype; + } 
- bool operator!=(const Track &o) const { - return !(*this == o) ; - } + bool operator!=(const Track &o) const { return !(*this == o); } - std::string get_name() const ; + std::string get_name() const; }; namespace std { - template <> struct hash { - std::size_t operator()(const Track& t) const { - uint64_t hash = 0 ; - hash += t.begin ; - hash += t.end << 32 ; - return std::hash()(hash) ; - } - }; -} - -bool track_comparator(Track a, Track b) ; - -std::vector load_tracks_from_file(std::string path) ; -std::unordered_map load_tracks_from_file_as_dict(std::string path) ; +template <> struct hash { + std::size_t operator()(const Track &t) const { + uint64_t hash = 0; + hash += t.begin; + hash += t.end << 32; + return std::hash()(hash); + } +}; +} // namespace std + +bool track_comparator(Track a, Track b); + +std::vector load_tracks_from_file(std::string path); +std::unordered_map load_tracks_from_file_as_dict(std::string path); #endif diff --git a/caller.cpp b/caller.cpp index 9a5d868..95eb25f 100644 --- a/caller.cpp +++ b/caller.cpp @@ -2,140 +2,170 @@ #include "caller.hpp" -using namespace std ; +using namespace std; void Caller::run() { - config = Configuration::getInstance(); - lprint({"PingPong SV Caller running on", to_string(config->threads), "threads.."}) ; - // load reference genome and SFS - load_chromosomes(config->reference) ; - lprint({"Loaded all chromosomes."}) ; - load_input_sfs() ; - // call SVs from extended SFS - vector svs ; - Extender extender = Extender(&SFSs) ; - extender.run(config->threads) ; - svs.insert(svs.begin(), extender.svs.begin(), extender.svs.end()) ; - lprint({"Predicted", to_string(svs.size()), "SVs from extended SFS."}) ; - interval_tree_t vartree ; - for (const auto& sv: svs) { - vartree.insert({sv.s - 1000, sv.e + 1000}) ; + config = Configuration::getInstance(); + lprint({"PingPong SV Caller running on", to_string(config->threads), + "threads.."}); + // load reference genome and SFS + load_chromosomes(config->reference); + lprint({"Loaded all chromosomes."}); + load_input_sfs(); + // call SVs from extended SFS + vector svs; + Extender extender = Extender(&SFSs); + extender.run(config->threads); + svs.insert(svs.begin(), extender.svs.begin(), extender.svs.end()); + lprint({"Predicted", to_string(svs.size()), "SVs from extended SFS."}); + interval_tree_t vartree; + for (const auto &sv : svs) { + vartree.insert({sv.s - 1000, sv.e + 1000}); + } + std::sort(svs.begin(), svs.end()); + // output POA alignments SAM + string poa_path = config->workdir + "/poa.sam"; + lprint({"Outputting POA alignments to", poa_path + ".."}); + osam.open(poa_path); + osam << "@HD\tVN:1.4" << endl; + for (int i = 0; i < chromosomes.size(); ++i) { + osam << "@SQ\tSN:" << chromosomes[i] << "\t" + << "LN:" << strlen(chromosome_seqs[chromosomes[i]]) << endl; + } + for (int j = 0; j < extender.alignments.size(); j++) { + const auto &c = extender.alignments[j]; + osam << c.chrom << ":" << c.s + 1 << "-" << c.e + 1 << "\t" + << "0" + << "\t" << c.chrom << "\t" << c.s + 1 << "\t" + << "60" + << "\t" << c.cigar << "\t" + << "*" + << "\t" + << "0" + << "\t" + << "0" + << "\t" << c.seq << "\t" + << "*" << endl; + } + osam.close(); + // output SV calls + string vcf_path = config->workdir + "/svs_poa.vcf"; + lprint({"Exporting", to_string(svs.size()), "SV calls to", vcf_path + ".."}); + ovcf.open(vcf_path); + print_vcf_header(); + for (const SV &sv : svs) { + ovcf << sv << endl; + } + ovcf.close(); + // + if (config->clipped) { + vector clipped_svs; + Clipper clipper(extender.clips); + 
clipper.call(config->threads, vartree); + int s = 0; + for (int i = 0; i < config->threads; i++) { + s += clipper._p_svs[i].size(); + clipped_svs.insert(svs.begin(), clipper._p_svs[i].begin(), + clipper._p_svs[i].end()); } - std::sort(svs.begin(), svs.end()) ; - // output POA alignments SAM - string poa_path = config->workdir + "/poa.sam" ; - lprint({"Outputting POA alignments to", poa_path + ".."}) ; - osam.open(poa_path) ; - osam << "@HD\tVN:1.4" << endl; - for (int i = 0; i < chromosomes.size(); ++i) { - osam << "@SQ\tSN:" << chromosomes[i] << "\t" << "LN:" << strlen(chromosome_seqs[chromosomes[i]]) << endl ; - } - for (int j = 0; j < extender.alignments.size(); j++) { - const auto& c = extender.alignments[j] ; - osam << c.chrom << ":" << c.s + 1 << "-" << c.e + 1 << "\t" - << "0" - << "\t" << c.chrom << "\t" << c.s + 1 << "\t" - << "60" - << "\t" << c.cigar << "\t" - << "*" - << "\t" - << "0" - << "\t" - << "0" - << "\t" << c.seq << "\t" - << "*" << endl ; - } - osam.close() ; - // output SV calls - string vcf_path = config->workdir + "/svs_poa.vcf" ; - lprint({"Exporting", to_string(svs.size()), "SV calls to", vcf_path + ".."}) ; - ovcf.open(vcf_path) ; - print_vcf_header() ; - for (const SV& sv: svs) { - ovcf << sv << endl ; - } - ovcf.close() ; - // - if (config->clipped) { - vector clipped_svs ; - Clipper clipper(extender.clips); - clipper.call(config->threads, vartree) ; - int s = 0 ; - for (int i = 0; i < config->threads; i++) { - s += clipper._p_svs[i].size() ; - clipped_svs.insert(svs.begin(), clipper._p_svs[i].begin(), clipper._p_svs[i].end()) ; - } - lprint({"Predicted", to_string(s), "SVs from clipped SFS."}) ; - string vcf_path = config->workdir + "/svs_clipped.vcf" ; - lprint({"Exporting", to_string(clipped_svs.size()), "SV calls to", vcf_path + ".."}) ; - ovcf.open(vcf_path) ; - print_vcf_header() ; - for (const SV& sv: clipped_svs) { - ovcf << sv << endl ; - } - ovcf.close() ; + lprint({"Predicted", to_string(s), "SVs from clipped SFS."}); + string vcf_path = config->workdir + "/svs_clipped.vcf"; + lprint({"Exporting", to_string(clipped_svs.size()), "SV calls to", + vcf_path + ".."}); + ovcf.open(vcf_path); + print_vcf_header(); + for (const SV &sv : clipped_svs) { + ovcf << sv << endl; } + ovcf.close(); + } } void Caller::load_input_sfs() { - int threads = config->threads ; - int num_batches = config->aggregate_batches ; - int num_threads = num_batches < threads ? num_batches : threads ; - vector>> _SFSs(num_batches) ; - lprint({"Loading assmbled SFS.."}) ; - #pragma omp parallel for num_threads(num_threads) - for (int j = 0; j < num_batches; j++) { - string s_j = std::to_string(j) ; - string inpath = config->workdir + "/solution_batch_" + s_j + ".assembled.sfs" ; - //cout << "[I] Loading SFS from " << inpath << endl ; - ifstream inf(inpath) ; - string line ; - if (inf.is_open()) { - string info[4]; - string read_name; - while (getline(inf, line)) { - stringstream ssin(line); - int i = 0; - while (ssin.good() && i < 4) { - ssin >> info[i++]; - } - if (info[0].compare("*") != 0) { - read_name = info[0]; - _SFSs[j][read_name] = vector(); - } - _SFSs[j][read_name].push_back(SFS(stoi(info[1]), stoi(info[2]), stoi(info[3]), true)) ; - } + int threads = config->threads; + int num_batches = config->aggregate_batches; + int num_threads = num_batches < threads ? 
num_batches : threads; + vector>> _SFSs(num_batches); + lprint({"Loading assmbled SFS.."}); +#pragma omp parallel for num_threads(num_threads) + for (int j = 0; j < num_batches; j++) { + string s_j = std::to_string(j); + string inpath = + config->workdir + "/solution_batch_" + s_j + ".assembled.sfs"; + // cout << "[I] Loading SFS from " << inpath << endl ; + ifstream inf(inpath); + string line; + if (inf.is_open()) { + string info[4]; + string read_name; + while (getline(inf, line)) { + stringstream ssin(line); + int i = 0; + while (ssin.good() && i < 4) { + ssin >> info[i++]; } - } - int r = 0 ; - int c = 0 ; - for (int j = 0; j < num_batches; j++) { - //lprint({"Batch", to_string(j), "with", to_string(_SFSs[j].size()), "strings."}); - r += _SFSs[j].size() ; - SFSs.insert(_SFSs[j].begin(), _SFSs[j].end()) ; - for (auto& read: _SFSs[j]) { - c += read.second.size() ; + if (info[0].compare("*") != 0) { + read_name = info[0]; + _SFSs[j][read_name] = vector(); } + _SFSs[j][read_name].push_back( + SFS(stoi(info[1]), stoi(info[2]), stoi(info[3]), true)); + } } - lprint({"Loaded", to_string(c), "SFS strings on", to_string(r), "reads."}) ; + } + int r = 0; + int c = 0; + for (int j = 0; j < num_batches; j++) { + // lprint({"Batch", to_string(j), "with", to_string(_SFSs[j].size()), + // "strings."}); + r += _SFSs[j].size(); + SFSs.insert(_SFSs[j].begin(), _SFSs[j].end()); + for (auto &read : _SFSs[j]) { + c += read.second.size(); + } + } + lprint({"Loaded", to_string(c), "SFS strings on", to_string(r), "reads."}); } void Caller::print_vcf_header() { - ovcf << "##fileformat=VCFv4.2" << endl; - ovcf << "##reference=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/technical/reference/20200513_hg38_NoALT/hg38.no_alt.fa.gz" << endl; - for (int i = 0; i < chromosomes.size(); ++i) { - ovcf << "##contig=" << endl; - } - ovcf << "##FILTER=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##INFO=" << endl; - ovcf << "##FORMAT=" << endl; - ovcf << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDEFAULT" << endl; + ovcf << "##fileformat=VCFv4.2" << endl; + ovcf << "##reference=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/" + "data_collections/HGSVC2/technical/reference/20200513_hg38_NoALT/" + "hg38.no_alt.fa.gz" + << endl; + for (int i = 0; i < chromosomes.size(); ++i) { + ovcf << "##contig=" + << endl; + } + ovcf << "##FILTER=" << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##INFO=" + << endl; + ovcf << "##FORMAT=" + << endl; + ovcf << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tDEFAULT" + << endl; } diff --git a/caller.hpp b/caller.hpp index b85a4b4..7466dcb 100644 --- a/caller.hpp +++ b/caller.hpp @@ -3,29 +3,29 @@ #include -#include "htslib/sam.h" #include "htslib/hts.h" +#include "htslib/sam.h" -#include "sv.hpp" -#include "vcf.hpp" -#include "config.hpp" +#include "chromosomes.hpp" #include "clipper.hpp" +#include "config.hpp" #include "extender.hpp" -#include "chromosomes.hpp" +#include "sv.hpp" +#include "vcf.hpp" class Caller { public: - void run(); + void run(); private: - Configuration* config; - std::unordered_map> SFSs ; + Configuration *config; + 
std::unordered_map> SFSs; - ofstream ovcf; - ofstream osam; - void load_input_sfs() ; - void print_vcf_header() ; + ofstream ovcf; + ofstream osam; + void load_input_sfs(); + void print_vcf_header(); }; #endif diff --git a/chromosomes.cpp b/chromosomes.cpp index dfcd0e9..59cc4d7 100644 --- a/chromosomes.cpp +++ b/chromosomes.cpp @@ -2,81 +2,81 @@ KSEQ_INIT(gzFile, gzread) -std::vector chromosomes ; -std::unordered_map chromosome_seqs ; +std::vector chromosomes; +std::unordered_map chromosome_seqs; string reverse_complement(string s) { - string rc(s) ; - int l = s.length() ; - for (int i = 0; i < l; i++) { - if (s[i] == 'A') { - rc[l - 1 - i] = 'T' ; - } - if (s[i] == 'C') { - rc[l - 1 - i] = 'G' ; - } - if (s[i] == 'G') { - rc[l - 1 - i] = 'C' ; - } - if (s[i] == 'T') { - rc[l - 1 - i] = 'A' ; - } + string rc(s); + int l = s.length(); + for (int i = 0; i < l; i++) { + if (s[i] == 'A') { + rc[l - 1 - i] = 'T'; } - return rc ; + if (s[i] == 'C') { + rc[l - 1 - i] = 'G'; + } + if (s[i] == 'G') { + rc[l - 1 - i] = 'C'; + } + if (s[i] == 'T') { + rc[l - 1 - i] = 'A'; + } + } + return rc; } string canonicalize(string s) { - auto rc = reverse_complement(s) ; - return rc < s ? rc : s ; + auto rc = reverse_complement(s); + return rc < s ? rc : s; } int get_reference_size(ifstream &fasta_file) { - fasta_file.seekg(0, ios_base::end) ; - int l = fasta_file.tellg() ; - fasta_file.seekg(0, ios_base::beg) ; - return l ; + fasta_file.seekg(0, ios_base::end); + int l = fasta_file.tellg(); + fasta_file.seekg(0, ios_base::beg); + return l; } void load_chromosomes(string path) { - lprint({"Loading reference genome from", path + ".."}); - gzFile fp = gzopen(path.c_str(), "r"); - kseq_t *seq = kseq_init(fp); - int l; - while ((l = kseq_read(seq)) >= 0) { - string name(seq->name.s) ; - // if (name.find('_') == -1) { - lprint({"Extracted", seq->name.s, "with", to_string(l), "bases."}); - for (uint i = 0; i < l; i++) { - seq->seq.s[i] = toupper(seq->seq.s[i]) ; - } - chromosomes.push_back(seq->name.s) ; - char* s = (char*) malloc(sizeof(char) * (l + 1)) ; - memcpy(s, seq->seq.s, l + 1) ; - s[l] = '\0' ; - chromosome_seqs[seq->name.s] = s; - // } + lprint({"Loading reference genome from", path + ".."}); + gzFile fp = gzopen(path.c_str(), "r"); + kseq_t *seq = kseq_init(fp); + int l; + while ((l = kseq_read(seq)) >= 0) { + string name(seq->name.s); + // if (name.find('_') == -1) { + lprint({"Extracted", seq->name.s, "with", to_string(l), "bases."}); + for (uint i = 0; i < l; i++) { + seq->seq.s[i] = toupper(seq->seq.s[i]); } - kseq_destroy(seq); - gzclose(fp); + chromosomes.push_back(seq->name.s); + char *s = (char *)malloc(sizeof(char) * (l + 1)); + memcpy(s, seq->seq.s, l + 1); + s[l] = '\0'; + chromosome_seqs[seq->name.s] = s; + // } + } + kseq_destroy(seq); + gzclose(fp); } string load_chromosome(string path) { - lprint({"Loading chromosome genome from", path, ".."}) ; - gzFile fp = gzopen(path.c_str(), "r") ; - kseq_t *seq = kseq_init(fp) ; - int l ; - string chrom ; - while ((l = kseq_read(seq)) >= 0) { - lprint({"Extracted", seq->name.s, "with", to_string(l), "bases"}); - for (uint i = 0; i < l; i++) { - seq->seq.s[i] = toupper(seq->seq.s[i]) ; - } - char* s = (char*) malloc(sizeof(char) * (l + 1)) ; - memcpy(s, seq->seq.s, l + 1) ; - chrom = string(s) ; - s[l] = '\0' ; + lprint({"Loading chromosome genome from", path, ".."}); + gzFile fp = gzopen(path.c_str(), "r"); + kseq_t *seq = kseq_init(fp); + int l; + string chrom; + while ((l = kseq_read(seq)) >= 0) { + lprint({"Extracted", seq->name.s, "with", 
to_string(l), "bases"}); + for (uint i = 0; i < l; i++) { + seq->seq.s[i] = toupper(seq->seq.s[i]); } - kseq_destroy(seq) ; - gzclose(fp) ; - return chrom ; + char *s = (char *)malloc(sizeof(char) * (l + 1)); + memcpy(s, seq->seq.s, l + 1); + chrom = string(s); + s[l] = '\0'; + } + kseq_destroy(seq); + gzclose(fp); + return chrom; } diff --git a/chromosomes.hpp b/chromosomes.hpp index adc09d5..af6721b 100644 --- a/chromosomes.hpp +++ b/chromosomes.hpp @@ -1,27 +1,27 @@ #ifndef CHR_HPP #define CHR_HPP -#include -#include #include #include +#include #include +#include #include -#include "kseq.h" #include "config.hpp" +#include "kseq.h" #include "lprint.hpp" -using namespace std ; +using namespace std; -extern std::vector chromosomes ; -extern std::unordered_map chromosome_seqs ; +extern std::vector chromosomes; +extern std::unordered_map chromosome_seqs; -int get_reference_size(std::ifstream &fasta_file) ; -void load_chromosomes(std::string path) ; -std::string canonicalize(std::string) ; -std::string reverse_complement(std::string) ; -std::string load_chromosome(string path) ; +int get_reference_size(std::ifstream &fasta_file); +void load_chromosomes(std::string path); +std::string canonicalize(std::string); +std::string reverse_complement(std::string); +std::string load_chromosome(string path); #endif diff --git a/clipper.cpp b/clipper.cpp index 807ea20..076d555 100644 --- a/clipper.cpp +++ b/clipper.cpp @@ -1,217 +1,226 @@ -#include +#include #include "clipper.hpp" -using namespace std ; -using namespace lib_interval_tree ; +using namespace std; +using namespace lib_interval_tree; -Clipper::Clipper(const vector& _clips) { - clips = _clips ; -} +Clipper::Clipper(const vector &_clips) { clips = _clips; } vector Clipper::remove_duplicates(const vector &clips) { - vector unique_clips ; - unordered_map qnames ; - for (const Clip& clip: clips) { - if (qnames.find(clip.name) == qnames.end()) { - qnames[clip.name] = 0 ; - unique_clips.push_back(clip) ; - } + vector unique_clips; + unordered_map qnames; + for (const Clip &clip : clips) { + if (qnames.find(clip.name) == qnames.end()) { + qnames[clip.name] = 0; + unique_clips.push_back(clip); } - return unique_clips; + } + return unique_clips; } -vector Clipper::combine(const vector& clips) { - int threads = 4 ; - vector> _p_combined_clips ; - _p_combined_clips.resize(threads) ; - // we first cluster by breakpoints - unordered_map>> clips_dict; - for (const Clip& c: clips) { - clips_dict[c.chrom][c.p].push_back(c); - } - // we then merge - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int i = 0; i < chromosomes.size(); i++) { - int t = i % threads ; - const string& chrom = chromosomes[i] ; - for (auto it = clips_dict[chrom].begin(); it != clips_dict[chrom].end(); ++it) { - uint max_l = 0 ; - for (const Clip& c: it->second) { - if (c.l > max_l) { - max_l = c.l ; - } - } - Clip clip = Clip("", chrom, it->first, max_l, it->second.front().starting, it->second.size()) ; - _p_combined_clips[t].push_back(clip); - } - } - vector combined_clips ; - for (int i = 0; i < threads; i++) { - combined_clips.insert(combined_clips.begin(), _p_combined_clips[i].begin(), _p_combined_clips[i].end()) ; +vector Clipper::combine(const vector &clips) { + int threads = 4; + vector> _p_combined_clips; + _p_combined_clips.resize(threads); + // we first cluster by breakpoints + unordered_map>> clips_dict; + for (const Clip &c : clips) { + clips_dict[c.chrom][c.p].push_back(c); + } +// we then merge +#pragma omp parallel for num_threads(threads) 
schedule(static, 1) + for (int i = 0; i < chromosomes.size(); i++) { + int t = i % threads; + const string &chrom = chromosomes[i]; + for (auto it = clips_dict[chrom].begin(); it != clips_dict[chrom].end(); + ++it) { + uint max_l = 0; + for (const Clip &c : it->second) { + if (c.l > max_l) { + max_l = c.l; + } + } + Clip clip = Clip("", chrom, it->first, max_l, it->second.front().starting, + it->second.size()); + _p_combined_clips[t].push_back(clip); } - return combined_clips ; + } + vector combined_clips; + for (int i = 0; i < threads; i++) { + combined_clips.insert(combined_clips.begin(), _p_combined_clips[i].begin(), + _p_combined_clips[i].end()); + } + return combined_clips; } -vector Clipper::filter_lowcovered(const vector &clips, const uint w) { - vector filtered_clips; - for (const Clip &c: clips) { - if (c.w >= w) { - filtered_clips.push_back(c); - } +vector Clipper::filter_lowcovered(const vector &clips, + const uint w) { + vector filtered_clips; + for (const Clip &c : clips) { + if (c.w >= w) { + filtered_clips.push_back(c); } - return filtered_clips; + } + return filtered_clips; } // Cluster clips by proximity // TODO: this might be too slow vector Clipper::cluster(const vector &clips, uint r) { - vector clusters; - map clusters_by_pos; - for (const Clip &c : clips) { - bool found = false; - for (map::iterator it = clusters_by_pos.begin(); it != clusters_by_pos.end(); ++it) { - if (it->first - r <= c.p && c.p <= it->first + r) { - found = true; - it->second.l = max(it->second.l, c.l); - it->second.w += c.w; - } - } - if (!found) { - clusters_by_pos[c.p] = c; - } + vector clusters; + map clusters_by_pos; + for (const Clip &c : clips) { + bool found = false; + for (map::iterator it = clusters_by_pos.begin(); + it != clusters_by_pos.end(); ++it) { + if (it->first - r <= c.p && c.p <= it->first + r) { + found = true; + it->second.l = max(it->second.l, c.l); + it->second.w += c.w; + } } - - for (map::iterator it = clusters_by_pos.begin(); it != clusters_by_pos.end(); ++it) { - clusters.push_back(it->second); + if (!found) { + clusters_by_pos[c.p] = c; } - return clusters; + } + + for (map::iterator it = clusters_by_pos.begin(); + it != clusters_by_pos.end(); ++it) { + clusters.push_back(it->second); + } + return clusters; } -vector Clipper::filter_tooclose_clips(const vector &clips, interval_tree_t& vartree) { - vector fclips; - for (const Clip& c: clips) { - if (vartree.overlap_find({c.p, c.p + 1}) == std::end(vartree)) { - fclips.push_back(c); - } +vector Clipper::filter_tooclose_clips(const vector &clips, + interval_tree_t &vartree) { + vector fclips; + for (const Clip &c : clips) { + if (vartree.overlap_find({c.p, c.p + 1}) == std::end(vartree)) { + fclips.push_back(c); } - return fclips; + } + return fclips; } // find smallest right that is larger than query -int binary_search(const vector& clips, int begin, int end, const Clip& query) { - //for (int i = 0; i < clips.size(); i++) { - // if (query.p < clips[i].p) { - // return i ; - // } - //} - //return -1 ; - if (begin > end || begin >= clips.size()) { - return -1 ; - } - int m = (begin + end) / 2 ; - if (clips[m].p == query.p) { - if (m + 1 < clips.size()) { - return m + 1 ; - } else { - return m ; - } - } else if (clips[m].p > query.p) { - if (m > 0 && clips[m - 1].p < query.p) { - return m ; - } - return binary_search(clips, begin, m - 1, query) ; +int binary_search(const vector &clips, int begin, int end, + const Clip &query) { + // for (int i = 0; i < clips.size(); i++) { + // if (query.p < clips[i].p) { + // return i ; + 
// } + // } + // return -1 ; + if (begin > end || begin >= clips.size()) { + return -1; + } + int m = (begin + end) / 2; + if (clips[m].p == query.p) { + if (m + 1 < clips.size()) { + return m + 1; } else { - return binary_search(clips, m + 1, end, query) ; + return m; + } + } else if (clips[m].p > query.p) { + if (m > 0 && clips[m - 1].p < query.p) { + return m; } + return binary_search(clips, begin, m - 1, query); + } else { + return binary_search(clips, m + 1, end, query); + } } -void Clipper::call(int threads, interval_tree_t& vartree) { - lprint({"Predicting SVS from", to_string(clips.size()), "clipped SFS on", to_string(threads), "threads.."}) ; - vector rclips; - vector lclips; - for (const Clip& clip: clips) { - if (clip.starting) { - lclips.push_back(clip); - } else { - rclips.push_back(clip); - } +void Clipper::call(int threads, interval_tree_t &vartree) { + lprint({"Predicting SVS from", to_string(clips.size()), "clipped SFS on", + to_string(threads), "threads.."}); + vector rclips; + vector lclips; + for (const Clip &clip : clips) { + if (clip.starting) { + lclips.push_back(clip); + } else { + rclips.push_back(clip); } - lprint({to_string(lclips.size()), "left clips."}) ; - lprint({to_string(rclips.size()), "right clips."}) ; - lprint({"Preprocessing clipped SFS.."}) ; - #pragma omp parallel for num_threads(2) schedule(static,1) - for (int i = 0; i < 2; i++) { - if (i == 0) { - rclips = remove_duplicates(rclips); - rclips = combine(rclips); - rclips = filter_lowcovered(rclips, 2); // FIXME: hardcoded - rclips = filter_tooclose_clips(rclips, vartree); - rclips = cluster(rclips, 1000); // FIXME: hardcoded - std::sort(rclips.begin(), rclips.end()) ; - } else { - lclips = remove_duplicates(lclips); - lclips = combine(lclips); - lclips = filter_lowcovered(lclips, 2); // FIXME: hardcoded - lclips = filter_tooclose_clips(lclips, vartree); - lclips = cluster(lclips, 1000); // FIXME: hardcoded - std::sort(lclips.begin(), lclips.end()) ; - } + } + lprint({to_string(lclips.size()), "left clips."}); + lprint({to_string(rclips.size()), "right clips."}); + lprint({"Preprocessing clipped SFS.."}); +#pragma omp parallel for num_threads(2) schedule(static, 1) + for (int i = 0; i < 2; i++) { + if (i == 0) { + rclips = remove_duplicates(rclips); + rclips = combine(rclips); + rclips = filter_lowcovered(rclips, 2); // FIXME: hardcoded + rclips = filter_tooclose_clips(rclips, vartree); + rclips = cluster(rclips, 1000); // FIXME: hardcoded + std::sort(rclips.begin(), rclips.end()); + } else { + lclips = remove_duplicates(lclips); + lclips = combine(lclips); + lclips = filter_lowcovered(lclips, 2); // FIXME: hardcoded + lclips = filter_tooclose_clips(lclips, vartree); + lclips = cluster(lclips, 1000); // FIXME: hardcoded + std::sort(lclips.begin(), lclips.end()); + } + } + lprint({to_string(lclips.size()), "left clips."}); + lprint({to_string(rclips.size()), "right clips."}); + _p_svs.resize(threads); + if (lclips.empty() || rclips.empty()) { + return; + } + lprint({"Predicting insertions.."}); +#pragma omp parallel for num_threads(threads) schedule(static, 1) + for (int i = 0; i < lclips.size(); i++) { + const Clip &lc = lclips[i]; + int t = omp_get_thread_num(); + string chrom = lc.chrom; + // we get the closest right clip + int r = binary_search(rclips, 0, rclips.size() - 1, lc); + if (r == -1) { + continue; + } + auto rc = rclips[r]; + if (rc.w == 0) { + continue; } - lprint({to_string(lclips.size()), "left clips."}) ; - lprint({to_string(rclips.size()), "right clips."}) ; - _p_svs.resize(threads) 
; - if (lclips.empty() || rclips.empty()) { - return ; - } - lprint({"Predicting insertions.."}) ; - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int i = 0; i < lclips.size(); i++) { - const Clip& lc = lclips[i] ; - int t = omp_get_thread_num() ; - string chrom = lc.chrom ; - // we get the closest right clip - int r = binary_search(rclips, 0, rclips.size() - 1, lc) ; - if (r == -1) { - continue ; - } - auto rc = rclips[r] ; - if (rc.w == 0) { - continue ; - } - if (abs((int)rc.p - (int)lc.p) < 1000) { - uint s = lc.w > rc.w ? lc.p : rc.p; - uint l = max(lc.l, rc.l); - string refbase(chromosome_seqs[chrom] + s, 1) ; - uint w = max(lc.w, rc.w); - _p_svs[t].push_back(SV("INS", chrom, s, refbase, "", w, 0, 0, 0, true, l)); - } + if (abs((int)rc.p - (int)lc.p) < 1000) { + uint s = lc.w > rc.w ? lc.p : rc.p; + uint l = max(lc.l, rc.l); + string refbase(chromosome_seqs[chrom] + s, 1); + uint w = max(lc.w, rc.w); + _p_svs[t].push_back( + SV("INS", chrom, s, refbase, "", w, 0, 0, 0, true, l)); + } + } + lprint({"Predicting deletions.."}); +#pragma omp parallel for num_threads(threads) schedule(static, 1) + for (int i = 0; i < rclips.size(); i++) { + const Clip &rc = rclips[i]; + int t = omp_get_thread_num(); + string chrom = rc.chrom; + // we get the closest right clip + int l = binary_search(lclips, 0, lclips.size() - 1, rc); + if (l == -1) { + continue; + } + auto lc = lclips[l]; + if (lc.w == 0) { + continue; } - lprint({"Predicting deletions.."}) ; - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int i = 0; i < rclips.size(); i++) { - const Clip& rc = rclips[i] ; - int t = omp_get_thread_num() ; - string chrom = rc.chrom ; - // we get the closest right clip - int l = binary_search(lclips, 0, lclips.size() - 1, rc) ; - if (l == -1) { - continue ; - } - auto lc = lclips[l] ; - if (lc.w == 0) { - continue ; - } - if (lc.p - rc.p >= 2000 && lc.p - rc.p <= 50000) { - uint s = rc.p; - uint l = lc.p - rc.p + 1; - string refbase(chromosome_seqs[chrom] + s, 1) ; - uint w = max(lc.w, rc.w); - if (w >= 5) { - _p_svs[t].push_back(SV("DEL", chrom, s, refbase, "", w, 0, 0, 0, true, l)); - } - } + if (lc.p - rc.p >= 2000 && lc.p - rc.p <= 50000) { + uint s = rc.p; + uint l = lc.p - rc.p + 1; + string refbase(chromosome_seqs[chrom] + s, 1); + uint w = max(lc.w, rc.w); + if (w >= 5) { + _p_svs[t].push_back( + SV("DEL", chrom, s, refbase, "", w, 0, 0, 0, true, l)); + } } + } } diff --git a/clipper.hpp b/clipper.hpp index 1cadc2b..d5aceab 100644 --- a/clipper.hpp +++ b/clipper.hpp @@ -2,61 +2,60 @@ #define CLIPPER_HPP #include -#include #include -#include +#include #include #include +#include -#include "htslib/sam.h" #include "htslib/hts.h" +#include "htslib/sam.h" #include "interval_tree.hpp" -#include "sv.hpp" -#include "lprint.hpp" #include "chromosomes.hpp" +#include "lprint.hpp" +#include "sv.hpp" struct Clip { - std::string name ; - std::string chrom ; - uint p ; - uint l ; - bool starting ; - uint w ; - - Clip() { - w = 0; - } - - Clip(std::string name_, std::string chrom_, const uint p_, uint l_, bool starting_, uint w_ = 0) { - name = name_ ; - chrom = chrom_ ; - p = p_ ; - l = l_ ; - starting = starting_ ; - w = w_ ; - } - - bool operator<(const Clip &c) const { - return p < c.p; - } + std::string name; + std::string chrom; + uint p; + uint l; + bool starting; + uint w; + + Clip() { w = 0; } + + Clip(std::string name_, std::string chrom_, const uint p_, uint l_, + bool starting_, uint w_ = 0) { + name = name_; + chrom = chrom_; + p = p_; + l = l_; + 
starting = starting_; + w = w_; + } + + bool operator<(const Clip &c) const { return p < c.p; } }; class Clipper { private: - std::vector clips ; - std::vector remove_duplicates(const std::vector &); - std::vector combine(const std::vector &); - std::vector filter_lowcovered(const std::vector &, const uint); - std::vector cluster(const std::vector &, uint); - std::vector filter_tooclose_clips(const std::vector &, lib_interval_tree::interval_tree_t &); + std::vector clips; + std::vector remove_duplicates(const std::vector &); + std::vector combine(const std::vector &); + std::vector filter_lowcovered(const std::vector &, const uint); + std::vector cluster(const std::vector &, uint); + std::vector + filter_tooclose_clips(const std::vector &, + lib_interval_tree::interval_tree_t &); public: - std::vector> _p_svs ; + std::vector> _p_svs; - Clipper(const std::vector&) ; - void call(int threads, lib_interval_tree::interval_tree_t&) ; + Clipper(const std::vector &); + void call(int threads, lib_interval_tree::interval_tree_t &); }; #endif diff --git a/cluster.cpp b/cluster.cpp index 69bac9a..04c3105 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -1,76 +1,75 @@ #include "cluster.hpp" -using namespace std ; +using namespace std; // AaCcGgTtNn ==> 0,1,2,3,4 unsigned char _char26_table[256] = { - 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4 /*'-'*/, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, + 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; string Cluster::poa() { - // --- POA - uint n_seqs = seqs.size(); - abpoa_t *ab = abpoa_init(); - abpoa_para_t *abpt = abpoa_init_para(); - abpt->disable_seeding = 1; - abpt->align_mode = 0; // global - abpt->out_msa = 0; - abpt->out_cons = 1; - abpt->out_gfa = 0; - // abpt->w = 6, abpt->k = 9; - // abpt->min_w = 10; // minimizer-based seeding and partition - // abpt->is_diploid = 1; - // abpt->min_freq = 0.3; - abpt->progressive_poa = 0; - abpoa_post_set_para(abpt); + // --- POA + uint n_seqs = seqs.size(); + abpoa_t *ab = abpoa_init(); + abpoa_para_t *abpt = abpoa_init_para(); + abpt->disable_seeding = 1; + abpt->align_mode = 0; 
// global + abpt->out_msa = 0; + abpt->out_cons = 1; + abpt->out_gfa = 0; + // abpt->w = 6, abpt->k = 9; + // abpt->min_w = 10; // minimizer-based seeding and partition + // abpt->is_diploid = 1; + // abpt->min_freq = 0.3; + abpt->progressive_poa = 0; + abpoa_post_set_para(abpt); - int *seq_lens = (int *)malloc(sizeof(int) * n_seqs); - uint8_t **bseqs = (uint8_t **)malloc(sizeof(uint8_t *) * n_seqs); - for (uint i = 0; i < n_seqs; ++i) { - seq_lens[i] = seqs[i].size(); - bseqs[i] = (uint8_t *)malloc(sizeof(uint8_t) * seq_lens[i]); - for (int j = 0; j < seq_lens[i]; ++j) { - bseqs[i][j] = _char26_table[(int)seqs[i][j]]; - } + int *seq_lens = (int *)malloc(sizeof(int) * n_seqs); + uint8_t **bseqs = (uint8_t **)malloc(sizeof(uint8_t *) * n_seqs); + for (uint i = 0; i < n_seqs; ++i) { + seq_lens[i] = seqs[i].size(); + bseqs[i] = (uint8_t *)malloc(sizeof(uint8_t) * seq_lens[i]); + for (int j = 0; j < seq_lens[i]; ++j) { + bseqs[i][j] = _char26_table[(int)seqs[i][j]]; } + } - abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, NULL); + abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, NULL); - abpoa_cons_t *abc = ab->abc; + abpoa_cons_t *abc = ab->abc; - string cons = ""; - if (abc->n_cons > 0) { - for (int j = 0; j < abc->cons_len[0]; ++j) { - cons += "ACGTN"[abc->cons_base[0][j]]; - } + string cons = ""; + if (abc->n_cons > 0) { + for (int j = 0; j < abc->cons_len[0]; ++j) { + cons += "ACGTN"[abc->cons_base[0][j]]; } + } - for (uint i = 0; i < n_seqs; ++i) { free(bseqs[i]); } - free(bseqs); free(seq_lens); - abpoa_free(ab); abpoa_free_para(abpt); + for (uint i = 0; i < n_seqs; ++i) { + free(bseqs[i]); + } + free(bseqs); + free(seq_lens); + abpoa_free(ab); + abpoa_free_para(abpt); - return cons ; + return cons; } -void Cluster::dump(ofstream& o) const { - //o << chrom << " " << s << " " << e << " " << cov << " " << size(); - for (const string &seq: seqs) { - o << " " << seq; - } - o << endl ; +void Cluster::dump(ofstream &o) const { + // o << chrom << " " << s << " " << e << " " << cov << " " << size(); + for (const string &seq : seqs) { + o << " " << seq; + } + o << endl; } diff --git a/cluster.hpp b/cluster.hpp index dfa85bb..94821f7 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -1,71 +1,63 @@ #ifndef CLUSTER_HPP #define CLUSTER_HPP -#include -#include +#include #include #include -#include +#include +#include #include "abpoa.h" #include "sfs.hpp" struct Cluster { - std::string chrom ; - int s ; - int e ; - int cov ; - std::vector seqs ; - - Cluster(const std::string &chrom_, uint s_, uint e_, uint cov_ = 0) { - chrom = chrom_ ; - s = s_ ; - e = e_ ; - cov = cov_ ; - } - - void set_cov(uint cov_) { - cov = cov_ ; + std::string chrom; + int s; + int e; + int cov; + std::vector seqs; + + Cluster(const std::string &chrom_, uint s_, uint e_, uint cov_ = 0) { + chrom = chrom_; + s = s_; + e = e_; + cov = cov_; + } + + void set_cov(uint cov_) { cov = cov_; } + + void add(const std::string &seq) { seqs.push_back(seq); } + + int get_len() const { + uint l = 0; + uint n = 0; + for (const std::string &seq : seqs) { + ++n; + l += seq.size(); } + return l / n; + } - void add(const std::string &seq) { - seqs.push_back(seq); - } - - int get_len() const { - uint l = 0; - uint n = 0; - for (const std::string &seq : seqs) { - ++n; - l += seq.size(); - } - return l / n; - } + std::vector get_seqs() const { return seqs; } - std::vector get_seqs() const { - return seqs ; - } + uint size() const { return seqs.size(); } - uint size() const { - return seqs.size() ; - } + void dump(std::ofstream &o) 
const; - void dump(std::ofstream &o) const ; - - std::string poa() ; + std::string poa(); }; struct ExtCluster { - std::string chrom ; - std::vector seqs ; - - ExtCluster() {} ; - - ExtCluster(const std::vector& _seqs) { - seqs = _seqs ; - chrom = seqs[0].chrom ; - } + std::string chrom; + std::vector seqs; + + ExtCluster(){}; + + ExtCluster(const std::vector &_seqs) { + seqs = _seqs; + chrom = seqs[0].chrom; + } }; #endif diff --git a/config.cpp b/config.cpp index 549eb50..9384524 100644 --- a/config.cpp +++ b/config.cpp @@ -1,120 +1,118 @@ #include "config.hpp" -Configuration* Configuration::instance = nullptr ; +Configuration *Configuration::instance = nullptr; -Configuration* Configuration::getInstance() { - if (instance == nullptr) { - instance = new Configuration() ; - } - return instance ; +Configuration *Configuration::getInstance() { + if (instance == nullptr) { + instance = new Configuration(); + } + return instance; } -Configuration::Configuration() : - parser("SVDSS, Structural Variant Discovery from Sample-specific Strings") { - parser.add_options() - ("bed", "", cxxopts::value()) - ("bam", "", cxxopts::value()) - ("vcf", "", cxxopts::value()) - ("type", "", cxxopts::value()) - ("index", "", cxxopts::value()) - ("fasta", "", cxxopts::value()) - ("fastq", "", cxxopts::value()) - ("target", "", cxxopts::value()) - ("sfsbam", "", cxxopts::value()) - ("append", "", cxxopts::value()) - ("workdir", "", cxxopts::value()) - ("reference", "", cxxopts::value()) - ("cutoff", "", cxxopts::value()) - ("batches", "", cxxopts::value()) - ("overlap", "", cxxopts::value()) - ("coverage", "", cxxopts::value()) - ("t,threads", "", cxxopts::value()) - ("min-sv-length", "", cxxopts::value()) - ("min-cluster-weight", "", cxxopts::value()) - ("clipped", "", cxxopts::value()->default_value("false")) - ("assemble", "", cxxopts::value()->default_value("false")) - ("noputative", "", cxxopts::value()->default_value("false")) - ("b,binary", "", cxxopts::value()->default_value("false")) - ("aggregate", "", cxxopts::value()->default_value("false")) - ("selective", "", cxxopts::value()->default_value("true")) - ("version", "Print version information.") - ("help", "Print this help.") - ; +Configuration::Configuration() + : parser( + "SVDSS, Structural Variant Discovery from Sample-specific Strings") { + parser.add_options()("bed", "", cxxopts::value())( + "bam", "", cxxopts::value())("vcf", "", + cxxopts::value())( + "type", "", cxxopts::value())("index", "", + cxxopts::value())( + "fasta", "", cxxopts::value())( + "fastq", "", cxxopts::value())( + "target", "", cxxopts::value())( + "sfsbam", "", cxxopts::value())( + "append", "", cxxopts::value())( + "workdir", "", cxxopts::value())( + "reference", "", cxxopts::value())("cutoff", "", + cxxopts::value())( + "batches", "", cxxopts::value())("overlap", "", + cxxopts::value())( + "coverage", "", cxxopts::value())("t,threads", "", + cxxopts::value())( + "min-sv-length", "", cxxopts::value())("min-cluster-weight", "", + cxxopts::value())( + "clipped", "", cxxopts::value()->default_value("false"))( + "assemble", "", cxxopts::value()->default_value("false"))( + "noputative", "", cxxopts::value()->default_value("false"))( + "b,binary", "", cxxopts::value()->default_value("false"))( + "aggregate", "", cxxopts::value()->default_value("false"))( + "selective", "", cxxopts::value()->default_value("true"))( + "version", "Print version information.")("help", "Print this help."); } -void Configuration::parse(int argc, char** argv) { - auto results = parser.parse(argc, argv) 
;
-    if (results.count("vcf")) {
-        vcf = results["vcf"].as<std::string>() ;
-    }
-    bam = "" ;
-    if (results.count("bam")) {
-        bam = results["bam"].as<std::string>() ;
-    }
-    if (results.count("sfsbam")) {
-        sfsbam = results["sfsbam"].as<std::string>() ;
-    }
-    if (results.count("bed")) {
-        bed = results["bed"].as<std::string>() ;
-    }
-    if (results.count("type")) {
-        type = results["type"].as<std::string>() ;
-    }
-    if (results.count("index")) {
-        index = results["index"].as<std::string>() ;
-    }
-    fastq = "" ;
-    if (results.count("fastq")) {
-        fastq = results["fastq"].as<std::string>() ;
-    }
-    fasta = "" ;
-    if (results.count("fasta")) {
-        fasta = results["fasta"].as<std::string>() ;
-    }
-    target = "" ;
-    if (results.count("target")) {
-        target = results["target"].as<std::string>() ;
-    }
-    if (results.count("cutoff")) {
-        cutoff = results["cutoff"].as<int>() ;
-    }
-    if (results.count("overlap")) {
-        overlap = results["overlap"].as<int>() ;
-    }
-    if (results.count("threads")) {
-        threads = results["threads"].as<int>() ;
-    }
-    if (results.count("workdir")) {
-        workdir = results["workdir"].as<std::string>() ;
-    } else {
-        workdir = "." ;
-    }
-    if (results.count("append")) {
-        append = results["append"].as<std::string>() ;
-    } else {
-        append = "" ;
-    }
-    if (results.count("coverage")) {
-        coverage = results["coverage"].as<int>() ;
-    }
-    if (results.count("reference")) {
-        reference = results["reference"].as<std::string>() ;
-    }
-    if (results.count("batches")) {
-        aggregate_batches = results["batches"].as<int>() ;
-    }
-    if (results.count("min-sv-length")) {
-        min_sv_length = max(25, results["min-sv-length"].as<int>()) ;
-    }
-    if (results.count("min-cluster-weight")) {
-        min_cluster_weight = results["min-cluster-weight"].as<int>() ;
-    }
-    binary = results["binary"].as<bool>() ;
-    clipped = results["clipped"].as<bool>() ;
-    assemble = results["assemble"].as<bool>() ;
-    putative = !(results["noputative"].as<bool>()) ;
-    aggregate = results["aggregate"].as<bool>() ;
-    selective = results["selective"].as<bool>() ;
-    version = results["version"].as<bool>();
-    help = results["help"].as<bool>();
+void Configuration::parse(int argc, char **argv) {
+  auto results = parser.parse(argc, argv);
+  if (results.count("vcf")) {
+    vcf = results["vcf"].as<std::string>();
+  }
+  bam = "";
+  if (results.count("bam")) {
+    bam = results["bam"].as<std::string>();
+  }
+  if (results.count("sfsbam")) {
+    sfsbam = results["sfsbam"].as<std::string>();
+  }
+  if (results.count("bed")) {
+    bed = results["bed"].as<std::string>();
+  }
+  if (results.count("type")) {
+    type = results["type"].as<std::string>();
+  }
+  if (results.count("index")) {
+    index = results["index"].as<std::string>();
+  }
+  fastq = "";
+  if (results.count("fastq")) {
+    fastq = results["fastq"].as<std::string>();
+  }
+  fasta = "";
+  if (results.count("fasta")) {
+    fasta = results["fasta"].as<std::string>();
+  }
+  target = "";
+  if (results.count("target")) {
+    target = results["target"].as<std::string>();
+  }
+  if (results.count("cutoff")) {
+    cutoff = results["cutoff"].as<int>();
+  }
+  if (results.count("overlap")) {
+    overlap = results["overlap"].as<int>();
+  }
+  if (results.count("threads")) {
+    threads = results["threads"].as<int>();
+  }
+  if (results.count("workdir")) {
+    workdir = results["workdir"].as<std::string>();
+  } else {
+    workdir = ".";
+  }
+  if (results.count("append")) {
+    append = results["append"].as<std::string>();
+  } else {
+    append = "";
+  }
+  if (results.count("coverage")) {
+    coverage = results["coverage"].as<int>();
+  }
+  if (results.count("reference")) {
+    reference = results["reference"].as<std::string>();
+  }
+  if (results.count("batches")) {
+    aggregate_batches = results["batches"].as<int>();
+  }
+  if (results.count("min-sv-length")) {
+    min_sv_length = max(25, results["min-sv-length"].as<int>());
+  }
+  if (results.count("min-cluster-weight")) {
+    min_cluster_weight = results["min-cluster-weight"].as<int>();
+  }
+  binary = results["binary"].as<bool>();
+  clipped
= results["clipped"].as(); + assemble = results["assemble"].as(); + putative = !(results["noputative"].as()); + aggregate = results["aggregate"].as(); + selective = results["selective"].as(); + version = results["version"].as(); + help = results["help"].as(); } diff --git a/config.hpp b/config.hpp index 42e676d..fc6f94c 100644 --- a/config.hpp +++ b/config.hpp @@ -1,68 +1,67 @@ #ifndef CONFIG_HPP #define CONFIG_HPP +#include #include #include -#include -#include "cxxopts.hpp" #include "bed_utils.hpp" +#include "cxxopts.hpp" -using namespace std ; +using namespace std; class Configuration { private: - static Configuration* instance ; + static Configuration *instance; public: - static Configuration* getInstance() ; + static Configuration *getInstance(); - void parse(int argc, char* argv[]) ; + void parse(int argc, char *argv[]); - int cutoff = 0 ; - int overlap = -1 ; - int threads = 4 ; - int coverage = 50 ; - int batch_size = 1000 ; - int min_sv_length = 25 ; - int min_indel_length = 20 ; - int aggregate_batches = 5 ; - int min_cluster_weight = 2 ; + int cutoff = 0; + int overlap = -1; + int threads = 4; + int coverage = 50; + int batch_size = 1000; + int min_sv_length = 25; + int min_indel_length = 20; + int aggregate_batches = 5; + int min_cluster_weight = 2; - bool binary = false ; - bool clipped = false ; - bool putative = true ; - bool assemble = false ; - bool aggregate = false ; - bool selective = true ; - bool version = false; - bool help = false; + bool binary = false; + bool clipped = false; + bool putative = true; + bool assemble = false; + bool aggregate = false; + bool selective = true; + bool version = false; + bool help = false; - std::string bed ; - std::string bam ; // reads bam (reconstructed or not) - std::string sfsbam ; // superstrings bam (from realignment) - std::string vcf ; - std::string type ; - std::string workdir ; - std::string append ; - std::string index ; - std::string fastq ; - std::string fasta ; - std::string target ; - std::string prefix ; - std::string reference ; + std::string bed; + std::string bam; // reads bam (reconstructed or not) + std::string sfsbam; // superstrings bam (from realignment) + std::string vcf; + std::string type; + std::string workdir; + std::string append; + std::string index; + std::string fastq; + std::string fasta; + std::string target; + std::string prefix; + std::string reference; private: + Configuration(); - Configuration() ; + Configuration(Configuration const &) = delete; + void operator=(Configuration const &) = delete; - Configuration(Configuration const&) = delete ; - void operator=(Configuration const&) = delete ; + Configuration &operator[](std::string); - Configuration& operator[](std::string) ; - - cxxopts::Options parser ; + cxxopts::Options parser; }; #endif diff --git a/extender.cpp b/extender.cpp index 3c10b91..9740f4e 100644 --- a/extender.cpp +++ b/extender.cpp @@ -2,837 +2,894 @@ #include "extender.hpp" -using namespace std ; +using namespace std; using namespace lib_interval_tree; -Extender::Extender(unordered_map>* _SFSs) { - SFSs = _SFSs ; - config = Configuration::getInstance() ; - min_w = config->min_cluster_weight ; +Extender::Extender(unordered_map> *_SFSs) { + SFSs = _SFSs; + config = Configuration::getInstance(); + min_w = config->min_cluster_weight; } void Extender::run(int _threads) { - threads = _threads ; - batch_size = int(10000 / threads) * threads ; - bam_file = hts_open(config->bam.c_str(), "r") ; - bam_index = sam_index_load(bam_file, config->bam.c_str()) ; - bam_header = 
sam_hdr_read(bam_file) ; - bgzf_mt(bam_file->fp.bgzf, 8, 1) ; - // extend reads - _p_clips.resize(threads) ; - _p_extended_sfs.resize(threads) ; - extend_parallel() ; - for (int i = 0; i < threads; i++) { - //extended_sfs.insert(extended_sfs.begin(), _p_extended_sfs[i].begin(), _p_extended_sfs[i].end()) ; - for (const auto& extsfs: _p_extended_sfs[i]) { - extended_sfs.push_back(extsfs) ; - } - clips.insert(clips.begin(), _p_clips[i].begin(), _p_clips[i].end()) ; - } - lprint({"Extension complete.", to_string(clips.size()), " clipped SFS."}) ; - // build a separate interval tree for each chromosome - cluster_no_interval_tree() ; - // put all clusters in a single vector - lprint({"Flattening interval clusters.."}) ; - map, ExtCluster> _ext_clusters ; - for(int i = 0; i < threads; i++) { - for (const auto& cluster: _p_sfs_clusters[i]) { - _ext_clusters.insert(make_pair(cluster.first, ExtCluster(cluster.second))) ; - } + threads = _threads; + batch_size = int(10000 / threads) * threads; + bam_file = hts_open(config->bam.c_str(), "r"); + bam_index = sam_index_load(bam_file, config->bam.c_str()); + bam_header = sam_hdr_read(bam_file); + bgzf_mt(bam_file->fp.bgzf, 8, 1); + // extend reads + _p_clips.resize(threads); + _p_extended_sfs.resize(threads); + extend_parallel(); + for (int i = 0; i < threads; i++) { + // extended_sfs.insert(extended_sfs.begin(), _p_extended_sfs[i].begin(), + // _p_extended_sfs[i].end()) ; + for (const auto &extsfs : _p_extended_sfs[i]) { + extended_sfs.push_back(extsfs); + } + clips.insert(clips.begin(), _p_clips[i].begin(), _p_clips[i].end()); + } + lprint({"Extension complete.", to_string(clips.size()), " clipped SFS."}); + // build a separate interval tree for each chromosome + cluster_no_interval_tree(); + // put all clusters in a single vector + lprint({"Flattening interval clusters.."}); + map, ExtCluster> _ext_clusters; + for (int i = 0; i < threads; i++) { + for (const auto &cluster : _p_sfs_clusters[i]) { + _ext_clusters.insert( + make_pair(cluster.first, ExtCluster(cluster.second))); } - for (const auto& cluster: _ext_clusters) { - ext_clusters.push_back(cluster.second) ; - } - // process each cluster separately - _p_clusters.resize(threads) ; - extract_sfs_sequences() ; - // flatten clusters into a single vector - for (int i = 0; i < threads; i++) { - clusters.insert(clusters.begin(), _p_clusters[i].begin(), _p_clusters[i].end()) ; - } - // merge POA alignments - _p_svs.resize(threads) ; - _p_alignments.resize(threads) ; - call() ; - for (int i = 0; i < threads; i++) { - svs.insert(svs.begin(), _p_svs[i].begin(), _p_svs[i].end()) ; - alignments.insert(alignments.begin(), _p_alignments[i].begin(), _p_alignments[i].end()) ; - } - lprint({"Extracted", to_string(svs.size()), "SVs."}); - sam_close(bam_file) ; - filter_sv_chains() ; - //lprint({"Error stats: ", to_string(skip_1), " can't be mapped, ", to_string(skip_2), " can't be extended ", to_string(skip_3), " anomalies.", to_string(skip_4), " clipped."}) ; + } + for (const auto &cluster : _ext_clusters) { + ext_clusters.push_back(cluster.second); + } + // process each cluster separately + _p_clusters.resize(threads); + extract_sfs_sequences(); + // flatten clusters into a single vector + for (int i = 0; i < threads; i++) { + clusters.insert(clusters.begin(), _p_clusters[i].begin(), + _p_clusters[i].end()); + } + // merge POA alignments + _p_svs.resize(threads); + _p_alignments.resize(threads); + call(); + for (int i = 0; i < threads; i++) { + svs.insert(svs.begin(), _p_svs[i].begin(), _p_svs[i].end()); + 
alignments.insert(alignments.begin(), _p_alignments[i].begin(), + _p_alignments[i].end()); + } + lprint({"Extracted", to_string(svs.size()), "SVs."}); + sam_close(bam_file); + filter_sv_chains(); + // lprint({"Error stats: ", to_string(skip_1), " can't be mapped, ", + // to_string(skip_2), " can't be extended ", to_string(skip_3), " anomalies.", + // to_string(skip_4), " clipped."}) ; } -pair Extender::get_unique_kmers(const vector> &alpairs, const uint k, const bool from_end, string chrom) { - if (alpairs.size() < k) { - return make_pair(-1, -1) ; - } +pair Extender::get_unique_kmers(const vector> &alpairs, + const uint k, const bool from_end, + string chrom) { + if (alpairs.size() < k) { + return make_pair(-1, -1); + } - map kmers ; - string kmer_seq ; - int i = 0 ; - while (i < alpairs.size() - k + 1) { - bool skip = false; - for (int j = i; j < i + k; j++) { - if (alpairs[j].first == -1 || alpairs[j].second == -1) { - // we want clean kmers only - ie placed kmers, no insertions or deletions - skip = true ; - i = j + 1 ; // jump to next possible start position - break ; - } - } - if (skip) { - continue ; - } - string kmer(chromosome_seqs[chrom] + alpairs[i].second, k) ; - kmers[kmer] += 1 ; - i++ ; - } - pair last_kmer = make_pair(-1, -1) ; - i = 0 ; - while (i < alpairs.size() - k + 1) { - int offset = i ; - if (from_end) { - offset = alpairs.size() - k - i ; - } - bool skip = false; - for (int j = offset; j < offset + k; j++) { - if (alpairs[j].first == -1 || alpairs[j].second == -1) { - skip = true ; - i += (j - offset) ; // jump to next possible start position - break ; - } - } - if (skip) { - i++ ; - continue ; - } - last_kmer = alpairs[offset] ; - string kmer(chromosome_seqs[chrom] + alpairs[offset].second, k) ; - if (kmers[kmer] == 1) { - break ; - } - i++ ; + map kmers; + string kmer_seq; + int i = 0; + while (i < alpairs.size() - k + 1) { + bool skip = false; + for (int j = i; j < i + k; j++) { + if (alpairs[j].first == -1 || alpairs[j].second == -1) { + // we want clean kmers only - ie placed kmers, no insertions or + // deletions + skip = true; + i = j + 1; // jump to next possible start position + break; + } + } + if (skip) { + continue; } - return last_kmer ; + string kmer(chromosome_seqs[chrom] + alpairs[i].second, k); + kmers[kmer] += 1; + i++; + } + pair last_kmer = make_pair(-1, -1); + i = 0; + while (i < alpairs.size() - k + 1) { + int offset = i; + if (from_end) { + offset = alpairs.size() - k - i; + } + bool skip = false; + for (int j = offset; j < offset + k; j++) { + if (alpairs[j].first == -1 || alpairs[j].second == -1) { + skip = true; + i += (j - offset); // jump to next possible start position + break; + } + } + if (skip) { + i++; + continue; + } + last_kmer = alpairs[offset]; + string kmer(chromosome_seqs[chrom] + alpairs[offset].second, k); + if (kmers[kmer] == 1) { + break; + } + i++; + } + return last_kmer; } // Parallelize within each chromosome void Extender::extend_parallel() { - lprint({"Extending superstrings on", to_string(threads), "threads.."}) ; - int p = 0 ; - int b = 0 ; - for (int i = 0; i < 2; i++) { - bam_entries.push_back(vector>(threads)) ; - for (int j = 0; j < threads; j++) { - for (int k = 0; k < batch_size / threads; k++) { - bam_entries[i][j].push_back(bam_init1()) ; - } - } + lprint({"Extending superstrings on", to_string(threads), "threads.."}); + int p = 0; + int b = 0; + for (int i = 0; i < 2; i++) { + bam_entries.push_back(vector>(threads)); + for (int j = 0; j < threads; j++) { + for (int k = 0; k < batch_size / threads; k++) { + 
bam_entries[i][j].push_back(bam_init1()); + } } - load_batch_bam(threads, batch_size, p) ; - time_t t ; - time(&t) ; - bool should_load = true ; - bool should_process = true ; - bool should_terminate = false ; - bool loaded_last_batch = false ; - uint64_t u = 0 ; - int num_reads = 0 ; - bool printed = false ; - #pragma omp parallel num_threads(config->threads + 1) - while (!loaded_last_batch) { - //lprint({"Beginning batch", to_string(b + 1)}); - #pragma omp single - { - for (int i = 0 ; i < threads ; i++) { - u += bam_entries[p][i].size() ; - } - } - #pragma omp for - for(int i = 0; i < threads + 1; i++) { - int t = omp_get_thread_num() ; - if (t == 0) { - // load next batch of entries - if (should_load) { - loaded_last_batch = !load_batch_bam(threads, batch_size, (p + 1) % 2) ; - if (loaded_last_batch) { - //lprint({"Last input batch loaded."}); - } else { - //lprint({"Loaded."}); - } - } - } else { - process_batch(bam_entries[p][t - 1], t - 1) ; - } - } - #pragma omp single - { - //if (loaded_last_batch) { - // break ; - //} - p += 1 ; - p %= 2 ; - b += 1 ; - time_t s ; - time(&s) ; - if (s - t == 0) { - s += 1 ; - } - cerr << "[I] Processed batch " << std::left << b << ". Alignments so far: " << std::right << u << ". Alignments per second: " << u / (s - t) << ". Time: " << std::fixed << s - t << "\r" ; - printed = true ; - } + } + load_batch_bam(threads, batch_size, p); + time_t t; + time(&t); + bool should_load = true; + bool should_process = true; + bool should_terminate = false; + bool loaded_last_batch = false; + uint64_t u = 0; + int num_reads = 0; + bool printed = false; +#pragma omp parallel num_threads(config->threads + 1) + while (!loaded_last_batch) { +// lprint({"Beginning batch", to_string(b + 1)}); +#pragma omp single + { + for (int i = 0; i < threads; i++) { + u += bam_entries[p][i].size(); + } } - if (printed) { - cerr << endl ; +#pragma omp for + for (int i = 0; i < threads + 1; i++) { + int t = omp_get_thread_num(); + if (t == 0) { + // load next batch of entries + if (should_load) { + loaded_last_batch = !load_batch_bam(threads, batch_size, (p + 1) % 2); + if (loaded_last_batch) { + // lprint({"Last input batch loaded."}); + } else { + // lprint({"Loaded."}); + } + } + } else { + process_batch(bam_entries[p][t - 1], t - 1); + } } - // cleanup - for (int i = 0; i < 2; i++) { - for (int j = 0; j < threads; j++) { - for (int k = 0; k < batch_size / threads; k++) { - bam_destroy1(bam_entries[i][j][k]) ; - } - } +#pragma omp single + { + // if (loaded_last_batch) { + // break ; + // } + p += 1; + p %= 2; + b += 1; + time_t s; + time(&s); + if (s - t == 0) { + s += 1; + } + cerr << "[I] Processed batch " << std::left << b + << ". Alignments so far: " << std::right << u + << ". Alignments per second: " << u / (s - t) + << ". 
Time: " << std::fixed << s - t << "\r"; + printed = true; + } + } + if (printed) { + cerr << endl; + } + // cleanup + for (int i = 0; i < 2; i++) { + for (int j = 0; j < threads; j++) { + for (int k = 0; k < batch_size / threads; k++) { + bam_destroy1(bam_entries[i][j][k]); + } } - lprint({"Done."}); + } + lprint({"Done."}); } bool Extender::load_batch_bam(int threads, int batch_size, int p) { - int i = 0 ; - int n = 0 ; - while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { - bam1_t* aln = bam_entries[p][n % threads][i] ; - if (aln == nullptr) { - break ; - } - if (aln->core.flag & BAM_FUNMAP || aln->core.flag & BAM_FSUPPLEMENTARY || aln->core.flag & BAM_FSECONDARY) { - continue ; - } - char *qname = bam_get_qname(aln); - if (SFSs->find(qname) == SFSs->end()) { - continue ; - } - n += 1 ; - if (n % threads == 0) { - i += 1 ; - } - if (n == batch_size) { - return true ; - } + int i = 0; + int n = 0; + while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { + bam1_t *aln = bam_entries[p][n % threads][i]; + if (aln == nullptr) { + break; } - if (n != 0 && n != batch_size) { - for (int j = n % threads; j < threads; j++) { - //cout << "Terminus at " << j << " " << i << endl ; - bam_entries[p][j][i] = nullptr ; - } - for (int j = 0; j < n % threads; j++) { - if (i + 1 < bam_entries[p][j].size()) { - //cout << "Terminus at " << j << " " << i + 1 << endl ; - bam_entries[p][j][i + 1] = nullptr ; - } - } + if (aln->core.flag & BAM_FUNMAP || aln->core.flag & BAM_FSUPPLEMENTARY || + aln->core.flag & BAM_FSECONDARY) { + continue; + } + char *qname = bam_get_qname(aln); + if (SFSs->find(qname) == SFSs->end()) { + continue; + } + n += 1; + if (n % threads == 0) { + i += 1; + } + if (n == batch_size) { + return true; } - //lprint({"Loaded", to_string(n), "BAM reads.."}); - return n != 0 ? true : false ; + } + if (n != 0 && n != batch_size) { + for (int j = n % threads; j < threads; j++) { + // cout << "Terminus at " << j << " " << i << endl ; + bam_entries[p][j][i] = nullptr; + } + for (int j = 0; j < n % threads; j++) { + if (i + 1 < bam_entries[p][j].size()) { + // cout << "Terminus at " << j << " " << i + 1 << endl ; + bam_entries[p][j][i + 1] = nullptr; + } + } + } + // lprint({"Loaded", to_string(n), "BAM reads.."}); + return n != 0 ? 
true : false; } -void Extender::process_batch(vector bam_entries, int index) { - bam1_t* aln ; - for (int b = 0; b < bam_entries.size(); b++) { - aln = bam_entries[b] ; - if (aln == nullptr) { - //cout << "Terminus " << b << " " << index << " " << endl ; - break ; - } - extend_alignment(aln, index) ; +void Extender::process_batch(vector bam_entries, int index) { + bam1_t *aln; + for (int b = 0; b < bam_entries.size(); b++) { + aln = bam_entries[b]; + if (aln == nullptr) { + // cout << "Terminus " << b << " " << index << " " << endl ; + break; } + extend_alignment(aln, index); + } } -void Extender::extend_alignment(bam1_t* aln, int index) { - char *qname = bam_get_qname(aln); - uint32_t *cigar = bam_get_cigar(aln); - vector> alpairs = get_aligned_pairs(aln); - string chrom(bam_header->target_name[aln->core.tid]) ; - if (chromosome_seqs.find(chrom) == chromosome_seqs.end()) { - return ; - } +void Extender::extend_alignment(bam1_t *aln, int index) { + char *qname = bam_get_qname(aln); + uint32_t *cigar = bam_get_cigar(aln); + vector> alpairs = get_aligned_pairs(aln); + string chrom(bam_header->target_name[aln->core.tid]); + if (chromosome_seqs.find(chrom) == chromosome_seqs.end()) { + return; + } - // NOTE: we may have more sfs on a clipped read, but all of them will produce the same clip - pair lclip ; - pair rclip ; - int last_pos = 0 ; - for (const SFS &sfs : SFSs->at(qname)) { - int s = sfs.s; - int e = sfs.s + sfs.l - 1; - int aln_start = -1 ; - int aln_end = -1 ; - vector> local_alpairs; - // find start and end of SFS ibn read's alignment - int refs = -1 ; - int refe = -1 ; - for (int i = last_pos; i < alpairs.size(); i++) { - int q = alpairs[i].first; - int r = alpairs[i].second; - if (q == -1 || r == -1) { - continue ; - } else if (q < s) { // <= seems more correct to me but using < we are more flexible - last_pos = i ; - refs = r ; - aln_start = i ; - } else if (q > e) { // >= seems more correct but using > we are more flexible - refe = r ; - aln_end = i ; - break ; - } - } - // cout << refs << "-" << refe << endl ; - // We extract the local alignment of the region of interest - if (refs == -1 && refe == -1) { - // we couldn't place the first and the last base, so we skip this - otherwise we'll end up considering the entire read - ++skip_1 ; - continue ; - } else if (refs == -1) { - uint op = bam_cigar_op(*(cigar + 0)); - uint l = bam_cigar_oplen(*(cigar + 0)); - if (op == BAM_CSOFT_CLIP) { - lclip = make_pair(aln->core.pos, l); - skip_4++ ; - } - // we skip this SFS - continue ; - } else if (refe == -1) { - uint op = bam_cigar_op(*(cigar + aln->core.n_cigar - 1)); - uint l = bam_cigar_oplen(*(cigar + aln->core.n_cigar - 1)); - if (op == BAM_CSOFT_CLIP) { - rclip = make_pair(bam_endpos(aln), l); - skip_4++ ; - } - // we skip this SFS - continue ; + // NOTE: we may have more sfs on a clipped read, but all of them will produce + // the same clip + pair lclip; + pair rclip; + int last_pos = 0; + for (const SFS &sfs : SFSs->at(qname)) { + int s = sfs.s; + int e = sfs.s + sfs.l - 1; + int aln_start = -1; + int aln_end = -1; + vector> local_alpairs; + // find start and end of SFS ibn read's alignment + int refs = -1; + int refe = -1; + for (int i = last_pos; i < alpairs.size(); i++) { + int q = alpairs[i].first; + int r = alpairs[i].second; + if (q == -1 || r == -1) { + continue; + } else if (q < s) { // <= seems more correct to me but using < we are more + // flexible + last_pos = i; + refs = r; + aln_start = i; + } else if (q > + e) { // >= seems more correct but using > we are more 
flexible + refe = r; + aln_end = i; + break; + } + } + // cout << refs << "-" << refe << endl ; + // We extract the local alignment of the region of interest + if (refs == -1 && refe == -1) { + // we couldn't place the first and the last base, so we skip this - + // otherwise we'll end up considering the entire read + ++skip_1; + continue; + } else if (refs == -1) { + uint op = bam_cigar_op(*(cigar + 0)); + uint l = bam_cigar_oplen(*(cigar + 0)); + if (op == BAM_CSOFT_CLIP) { + lclip = make_pair(aln->core.pos, l); + skip_4++; + } + // we skip this SFS + continue; + } else if (refe == -1) { + uint op = bam_cigar_op(*(cigar + aln->core.n_cigar - 1)); + uint l = bam_cigar_oplen(*(cigar + aln->core.n_cigar - 1)); + if (op == BAM_CSOFT_CLIP) { + rclip = make_pair(bam_endpos(aln), l); + skip_4++; + } + // we skip this SFS + continue; + } else { + // we placed the first and last base, so we extract the alignment (ie a + // substring) + int last_r = refs - 1; + for (uint i = aln_start; i <= aln_end; i++) { + int q = alpairs[i].first; + int r = alpairs[i].second; + if (r == -1) { + if (refs <= last_r && last_r <= refe) { + local_alpairs.push_back(make_pair(q, r)); + } } else { - // we placed the first and last base, so we extract the alignment (ie a substring) - int last_r = refs - 1; - for (uint i = aln_start; i <= aln_end; i++) { - int q = alpairs[i].first ; - int r = alpairs[i].second ; - if (r == -1) { - if (refs <= last_r && last_r <= refe) { - local_alpairs.push_back(make_pair(q, r)); - } - } else { - last_r = r; - if (refs <= r && r <= refe) { - local_alpairs.push_back(make_pair(q, r)); - } - } - // We break when we found a placed base at or after the reference end - if (q != -1 && r != -1 && r >= refe) { - break ; - } - } + last_r = r; + if (refs <= r && r <= refe) { + local_alpairs.push_back(make_pair(q, r)); + } } - - // only if we have been able to place the SFS.. - // 3 ..we extract the maxw (100) pairs preceding the region of interest - vector> pre_alpairs ; - uint n = 0 ; - for (int i = aln_start - 1; i >= 0; --i) { - int q = alpairs[i].first; - int r = alpairs[i].second; - if (n < maxw) { - pre_alpairs.push_back(make_pair(q, r)); - n++ ; - } - } - reverse(pre_alpairs.begin(), pre_alpairs.end()); - // 4 we extract the maxw (100) pairs following the region of interest - vector> post_alpairs; - n = 0; - for (uint i = aln_end + 1; i < alpairs.size(); i++) { - int q = alpairs[i].first; - int r = alpairs[i].second; - if (n < maxw) { - post_alpairs.push_back(make_pair(q, r)); - n++ ; - } + // We break when we found a placed base at or after the reference end + if (q != -1 && r != -1 && r >= refe) { + break; } + } + } - // 5 we get the unique kmer in the upstream and downstream maxw-bp regions - pair prekmer = get_unique_kmers(pre_alpairs, kmer_size, true, chrom); // true for first kmer found (shorter cluster) - pair postkmer = get_unique_kmers(post_alpairs, kmer_size, false, chrom); // false for first kmer found (shorter cluster) + // only if we have been able to place the SFS.. 
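// --- Editor's illustration (not part of the patch) ---------------------------
// extend_alignment() walks the (query_pos, ref_pos) pairs returned by
// get_aligned_pairs() and treats -1 as "this side is gapped here". Below is a
// minimal, self-contained sketch of that convention, building the pairs from
// (length, op) CIGAR tuples; the exact handling of clips in bam.cpp is an
// assumption of this sketch.
#include <utility>
#include <vector>

std::vector<std::pair<int, int>>
aligned_pairs_sketch(const std::vector<std::pair<int, char>> &cigar,
                     int ref_start) {
  std::vector<std::pair<int, int>> pairs;
  int q = 0, r = ref_start;
  for (const auto &c : cigar) {
    const int len = c.first;
    const char op = c.second;
    if (op == 'M' || op == '=' || op == 'X') {
      for (int i = 0; i < len; ++i)
        pairs.push_back({q++, r++}); // placed base: both coordinates advance
    } else if (op == 'I' || op == 'S') {
      for (int i = 0; i < len; ++i)
        pairs.push_back({q++, -1}); // consumes the query only
    } else if (op == 'D' || op == 'N') {
      for (int i = 0; i < len; ++i)
        pairs.push_back({-1, r++}); // consumes the reference only
    } // H and P consume neither sequence
  }
  return pairs;
}
// -----------------------------------------------------------------------------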
+ // 3 ..we extract the maxw (100) pairs preceding the region of interest + vector> pre_alpairs; + uint n = 0; + for (int i = aln_start - 1; i >= 0; --i) { + int q = alpairs[i].first; + int r = alpairs[i].second; + if (n < maxw) { + pre_alpairs.push_back(make_pair(q, r)); + n++; + } + } + reverse(pre_alpairs.begin(), pre_alpairs.end()); + // 4 we extract the maxw (100) pairs following the region of interest + vector> post_alpairs; + n = 0; + for (uint i = aln_end + 1; i < alpairs.size(); i++) { + int q = alpairs[i].first; + int r = alpairs[i].second; + if (n < maxw) { + post_alpairs.push_back(make_pair(q, r)); + n++; + } + } - // if we couldn't place a kmer, we just get the entire region - if (prekmer.first == -1 || prekmer.second == -1) { - prekmer.first = local_alpairs.front().first ; - prekmer.second = local_alpairs.front().second ; - } - if (postkmer.first == -1 || postkmer.second == -1) { - postkmer.first = local_alpairs.back().first ; - postkmer.second = local_alpairs.back().second ; - } - // if also the entire region is not correctly placed, then we skip it - // NOTE: I think we can solve this by increasing maxw - if (prekmer.first == -1 || prekmer.second == -1 || postkmer.first == -1 || postkmer.second == -1) { - ++skip_2 ; - continue ; - } - // FIXME: understand why this is happening (chr16 on full giab genome) - if ((uint) prekmer.second > postkmer.second + kmer_size) { - cerr << "Error on " << qname << ". SFS starting at " << sfs.s << " (length " << sfs.l << ")." << endl; - } else { - _p_extended_sfs[index].push_back(ExtSFS(string(chrom), string(qname), prekmer.second, postkmer.second + kmer_size)); - } + // 5 we get the unique kmer in the upstream and downstream maxw-bp regions + pair prekmer = + get_unique_kmers(pre_alpairs, kmer_size, true, + chrom); // true for first kmer found (shorter cluster) + pair postkmer = + get_unique_kmers(post_alpairs, kmer_size, false, + chrom); // false for first kmer found (shorter cluster) + + // if we couldn't place a kmer, we just get the entire region + if (prekmer.first == -1 || prekmer.second == -1) { + prekmer.first = local_alpairs.front().first; + prekmer.second = local_alpairs.front().second; + } + if (postkmer.first == -1 || postkmer.second == -1) { + postkmer.first = local_alpairs.back().first; + postkmer.second = local_alpairs.back().second; } - if (lclip.second > 0) { - _p_clips[index].push_back(Clip(qname, chrom, lclip.first, lclip.second, true)) ; + // if also the entire region is not correctly placed, then we skip it + // NOTE: I think we can solve this by increasing maxw + if (prekmer.first == -1 || prekmer.second == -1 || postkmer.first == -1 || + postkmer.second == -1) { + ++skip_2; + continue; } - if (rclip.second > 0) { - _p_clips[index].push_back(Clip(qname, chrom, rclip.first, rclip.second, false)) ; + // FIXME: understand why this is happening (chr16 on full giab genome) + if ((uint)prekmer.second > postkmer.second + kmer_size) { + cerr << "Error on " << qname << ". SFS starting at " << sfs.s + << " (length " << sfs.l << ")." 
<< endl; + } else { + _p_extended_sfs[index].push_back(ExtSFS(string(chrom), string(qname), + prekmer.second, + postkmer.second + kmer_size)); } + } + if (lclip.second > 0) { + _p_clips[index].push_back( + Clip(qname, chrom, lclip.first, lclip.second, true)); + } + if (rclip.second > 0) { + _p_clips[index].push_back( + Clip(qname, chrom, rclip.first, rclip.second, false)); + } } -bool overlap(int s1, int e1, const ExtSFS& sfs) { - int o = max(s1, sfs.s) - min(e1, sfs.e) ; - return o >= 0 ; +bool overlap(int s1, int e1, const ExtSFS &sfs) { + int o = max(s1, sfs.s) - min(e1, sfs.e); + return o >= 0; } void Extender::cluster_no_interval_tree() { - lprint({"Sorting ", to_string(extended_sfs.size()), " extended SFS.."}); - std::sort(extended_sfs.begin(), extended_sfs.end()) ; - lprint({"Done."}) ; - auto r = std::max_element(extended_sfs.begin(), extended_sfs.end(), - [] (const ExtSFS& lhs, const ExtSFS& rhs) { - return lhs.e - lhs.s < rhs.e - rhs.s ; - }) ; - int dist = (r->e - r->s) * 1.1 ; - lprint({"Maximum extended-SFS length:", to_string(r->e - r->s), "bp. Using separation distance", to_string(dist) + "."}) ; - // find large gaps - int prev_i = 0 ; - int prev_e = extended_sfs[0].e ; - string prev_chrom = extended_sfs[0].chrom ; - vector> intervals ; - for (int i = 1 ; i < extended_sfs.size(); i++) { - const auto& sfs = extended_sfs[i] ; - // new chromosome - if (sfs.chrom != prev_chrom) { - prev_chrom = sfs.chrom ; - intervals.push_back(make_pair(prev_i, i - 1)) ; - prev_i = i ; - prev_e = sfs.e ; - continue ; - } else { - if (sfs.s - prev_e > dist) { - intervals.push_back(make_pair(prev_i, i - 1)) ; - prev_e = sfs.e ; - prev_i = i ; - } - } + lprint({"Sorting ", to_string(extended_sfs.size()), " extended SFS.."}); + std::sort(extended_sfs.begin(), extended_sfs.end()); + lprint({"Done."}); + auto r = std::max_element(extended_sfs.begin(), extended_sfs.end(), + [](const ExtSFS &lhs, const ExtSFS &rhs) { + return lhs.e - lhs.s < rhs.e - rhs.s; + }); + int dist = (r->e - r->s) * 1.1; + lprint({"Maximum extended-SFS length:", to_string(r->e - r->s), + "bp. 
Using separation distance", to_string(dist) + "."}); + // find large gaps + int prev_i = 0; + int prev_e = extended_sfs[0].e; + string prev_chrom = extended_sfs[0].chrom; + vector> intervals; + for (int i = 1; i < extended_sfs.size(); i++) { + const auto &sfs = extended_sfs[i]; + // new chromosome + if (sfs.chrom != prev_chrom) { + prev_chrom = sfs.chrom; + intervals.push_back(make_pair(prev_i, i - 1)); + prev_i = i; + prev_e = sfs.e; + continue; + } else { + if (sfs.s - prev_e > dist) { + intervals.push_back(make_pair(prev_i, i - 1)); + prev_e = sfs.e; + prev_i = i; + } } - intervals.push_back(make_pair(prev_i, extended_sfs.size() - 1)) ; - // cluster each interval independently - time_t f ; - time(&f) ; - time_t s ; - time(&s) ; - time_t u ; - bool printed = false ; - lprint({"Retrieved", to_string(intervals.size()), " intervals."}) ; - _p_sfs_clusters.resize(threads) ; - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int i = 0; i < intervals.size(); i++) { - int t = omp_get_thread_num() ; - interval_tree_t tree; - int j = intervals[i].first ; - int low = extended_sfs[j].s ; - int high = extended_sfs[j].e ; - int last_j = j ; - j++ ; - for (; j <= intervals[i].second; j++) { - const ExtSFS& sfs = extended_sfs[j] ; - if (sfs.s <= high) { - low = min(low, sfs.s) ; - high = max(high, sfs.e) ; - } else { - for (int k = last_j; k < j; k++) { - _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); - } - low = sfs.s ; - high = sfs.e ; - last_j = j ; - } - } - for (int k = last_j; k < intervals[i].second; k++) { - _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); - } - if (t == 0) { - time(&u) ; - if (u - s > 30) { - cerr << "[I] Processed " << std::left << i << " intervals so far. Intervals per second: " << i / (u - f) << ". Time: " << u - f << "\r" ; - time(&s) ; - printed = true ; - } - } + } + intervals.push_back(make_pair(prev_i, extended_sfs.size() - 1)); + // cluster each interval independently + time_t f; + time(&f); + time_t s; + time(&s); + time_t u; + bool printed = false; + lprint({"Retrieved", to_string(intervals.size()), " intervals."}); + _p_sfs_clusters.resize(threads); +#pragma omp parallel for num_threads(threads) schedule(static, 1) + for (int i = 0; i < intervals.size(); i++) { + int t = omp_get_thread_num(); + interval_tree_t tree; + int j = intervals[i].first; + int low = extended_sfs[j].s; + int high = extended_sfs[j].e; + int last_j = j; + j++; + for (; j <= intervals[i].second; j++) { + const ExtSFS &sfs = extended_sfs[j]; + if (sfs.s <= high) { + low = min(low, sfs.s); + high = max(high, sfs.e); + } else { + for (int k = last_j; k < j; k++) { + _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); + } + low = sfs.s; + high = sfs.e; + last_j = j; + } + } + for (int k = last_j; k < intervals[i].second; k++) { + _p_sfs_clusters[t][make_pair(low, high)].push_back(extended_sfs[k]); } - if (printed) { - cerr << endl ; + if (t == 0) { + time(&u); + if (u - s > 30) { + cerr << "[I] Processed " << std::left << i + << " intervals so far. Intervals per second: " << i / (u - f) + << ". 
Time: " << u - f << "\r"; + time(&s); + printed = true; + } } + } + if (printed) { + cerr << endl; + } } void Extender::extract_sfs_sequences() { - lprint({"Analyzing", to_string(ext_clusters.size()), "clusters.."}) ; - char* seq[threads] ; - uint32_t len[threads] ; - bam1_t* _p_aln[threads] ; - samFile* _p_bam_file[threads] ; - hts_idx_t* _p_bam_index[threads] ; - bam_hdr_t* _p_bam_header[threads] ; - for (int i = 0; i < threads; i++) { - len[i] = 0 ; - _p_aln[i] = bam_init1() ; - _p_bam_file[i] = hts_open(config->bam.c_str(), "r") ; - _p_bam_index[i] = sam_index_load(_p_bam_file[i], config->bam.c_str()) ; - _p_bam_header[i] = sam_hdr_read(_p_bam_file[i]) ; - bgzf_mt(_p_bam_file[i]->fp.bgzf, 8, 1) ; - } - time_t f ; - time(&f) ; - time_t s ; - time(&s) ; - time_t u ; - bool printed = false ; - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int i = 0; i < ext_clusters.size(); i++) { - int t = i % threads ; - const auto& cluster = ext_clusters[i] ; + lprint({"Analyzing", to_string(ext_clusters.size()), "clusters.."}); + char *seq[threads]; + uint32_t len[threads]; + bam1_t *_p_aln[threads]; + samFile *_p_bam_file[threads]; + hts_idx_t *_p_bam_index[threads]; + bam_hdr_t *_p_bam_header[threads]; + for (int i = 0; i < threads; i++) { + len[i] = 0; + _p_aln[i] = bam_init1(); + _p_bam_file[i] = hts_open(config->bam.c_str(), "r"); + _p_bam_index[i] = sam_index_load(_p_bam_file[i], config->bam.c_str()); + _p_bam_header[i] = sam_hdr_read(_p_bam_file[i]); + bgzf_mt(_p_bam_file[i]->fp.bgzf, 8, 1); + } + time_t f; + time(&f); + time_t s; + time(&s); + time_t u; + bool printed = false; +#pragma omp parallel for num_threads(threads) schedule(static, 1) + for (int i = 0; i < ext_clusters.size(); i++) { + int t = i % threads; + const auto &cluster = ext_clusters[i]; - unordered_map reads ; - int cluster_s = numeric_limits::max() ; - int cluster_e = 0; - for (const ExtSFS &esfs: cluster.seqs) { - cluster_s = min(cluster_s, esfs.s); - cluster_e = max(cluster_e, esfs.e); - reads[esfs.qname] = true ; - } - uint cluster_size = reads.size(); - if (cluster_size < min_w) { - ++small_cl ; - continue ; - } - - string chrom = cluster.chrom ; - Cluster global_cluster = Cluster(chrom, cluster_s, cluster_e); + unordered_map reads; + int cluster_s = numeric_limits::max(); + int cluster_e = 0; + for (const ExtSFS &esfs : cluster.seqs) { + cluster_s = min(cluster_s, esfs.s); + cluster_e = max(cluster_e, esfs.e); + reads[esfs.qname] = true; + } + uint cluster_size = reads.size(); + if (cluster_size < min_w) { + ++small_cl; + continue; + } - uint cov = 0; - string region = chrom + ":" + to_string(cluster_s) + "-" + to_string(cluster_e); - hts_itr_t *itr = sam_itr_querys(_p_bam_index[t], _p_bam_header[t], region.c_str()); - while (sam_itr_next(_p_bam_file[t], itr, _p_aln[t]) > 0) { - bam1_t* aln = _p_aln[t] ; - if (aln->core.flag & BAM_FUNMAP || aln->core.flag & BAM_FSUPPLEMENTARY || aln->core.flag & BAM_FSECONDARY) { - continue; - } - ++cov; // FIXME: this cov takes into account also reads starting or ending inside the cluster (maybe we should skip those?) 
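// --- Editor's illustration (not part of the patch) ---------------------------
// extract_sfs_sequences() fetches every primary alignment overlapping a
// cluster with an indexed region query, as sketched below in a self-contained
// helper. Error handling is omitted for brevity; counting reads that merely
// start or end inside the window is what the FIXME above questions.
#include <string>

#include "htslib/sam.h"

static int count_primary_in_region(const char *bam_path,
                                   const std::string &region) {
  samFile *fp = hts_open(bam_path, "r");
  bam_hdr_t *hdr = sam_hdr_read(fp);
  hts_idx_t *idx = sam_index_load(fp, bam_path);
  bam1_t *aln = bam_init1();
  hts_itr_t *itr = sam_itr_querys(idx, hdr, region.c_str()); // e.g. "chr1:100-200"
  int cov = 0;
  while (sam_itr_next(fp, itr, aln) > 0) {
    if (aln->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY))
      continue; // keep primary alignments only, as the surrounding code does
    ++cov;
  }
  hts_itr_destroy(itr); // release the iterator once the region is exhausted
  bam_destroy1(aln);
  hts_idx_destroy(idx);
  bam_hdr_destroy(hdr);
  sam_close(fp);
  return cov;
}
// -----------------------------------------------------------------------------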
+ string chrom = cluster.chrom; + Cluster global_cluster = Cluster(chrom, cluster_s, cluster_e); - char *qname = bam_get_qname(aln); - if (reads.find(qname) == reads.end()) { - continue ; - } + uint cov = 0; + string region = + chrom + ":" + to_string(cluster_s) + "-" + to_string(cluster_e); + hts_itr_t *itr = + sam_itr_querys(_p_bam_index[t], _p_bam_header[t], region.c_str()); + while (sam_itr_next(_p_bam_file[t], itr, _p_aln[t]) > 0) { + bam1_t *aln = _p_aln[t]; + if (aln->core.flag & BAM_FUNMAP || aln->core.flag & BAM_FSUPPLEMENTARY || + aln->core.flag & BAM_FSECONDARY) { + continue; + } + ++cov; // FIXME: this cov takes into account also reads starting or ending + // inside the cluster (maybe we should skip those?) - uint32_t l = aln->core.l_qseq; - if (l >= len[t]) { - if (len[t] != 0) { - free(seq[t]) ; - } - len[t] = l ; - seq[t] = (char*) malloc(l + 1) ; - } - uint8_t *q = bam_get_seq(aln) ; - for (int i = 0; i < l; i++){ - seq[t][i] = seq_nt16_str[bam_seqi(q, i)]; - } - seq[t][l] = '\0'; + char *qname = bam_get_qname(aln); + if (reads.find(qname) == reads.end()) { + continue; + } - vector> alpairs = get_aligned_pairs(aln); - int qs = -1, qe = -1; - // getting starting and ending positions on read sequence aligning to start/end of cluster - for (int i = alpairs.size() - 1; i >= 0; --i) { - // finding starting position - int q = alpairs[i].first; - int r = alpairs[i].second; - if (q == -1 || r == -1) { - continue; - } - if (r <= cluster_s) { - qs = q; - break; - } - } - for (uint i = 0; i < alpairs.size(); ++i) { - // finding ending position - int q = alpairs[i].first; - int r = alpairs[i].second; - if (q == -1 || r == -1) { - continue; - } - if (r >= cluster_e) { - qe = q; - break; - } - } - if (qs == -1 || qe == -1) { - // reads starts or ends inside the cluster - // TODO: get only remaining prefix/suffix? but this may make POA and realignment harder - ++skip_3; - } else { - string _seq(seq[t], qs, qe - qs + 1) ; - global_cluster.add(_seq) ; - } - } - if (global_cluster.size() >= min_w) { - global_cluster.set_cov(cov) ; - _p_clusters[t].push_back(global_cluster) ; - ++extcl ; - } else { - ++small_extcl ; - } - if (t == 0) { - time(&u) ; - if (u - s > 30) { - cerr << "[I] Processed " << std::left << i << " clusters so far. Clusters per second: " << i / (u - f) << ". Time: " << u - f << "\r" ; - time(&s) ; - printed = true ; - } - } + uint32_t l = aln->core.l_qseq; + if (l >= len[t]) { + if (len[t] != 0) { + free(seq[t]); + } + len[t] = l; + seq[t] = (char *)malloc(l + 1); + } + uint8_t *q = bam_get_seq(aln); + for (int i = 0; i < l; i++) { + seq[t][i] = seq_nt16_str[bam_seqi(q, i)]; + } + seq[t][l] = '\0'; + + vector> alpairs = get_aligned_pairs(aln); + int qs = -1, qe = -1; + // getting starting and ending positions on read sequence aligning to + // start/end of cluster + for (int i = alpairs.size() - 1; i >= 0; --i) { + // finding starting position + int q = alpairs[i].first; + int r = alpairs[i].second; + if (q == -1 || r == -1) { + continue; + } + if (r <= cluster_s) { + qs = q; + break; + } + } + for (uint i = 0; i < alpairs.size(); ++i) { + // finding ending position + int q = alpairs[i].first; + int r = alpairs[i].second; + if (q == -1 || r == -1) { + continue; + } + if (r >= cluster_e) { + qe = q; + break; + } + } + if (qs == -1 || qe == -1) { + // reads starts or ends inside the cluster + // TODO: get only remaining prefix/suffix? 
but this may make POA and + // realignment harder + ++skip_3; + } else { + string _seq(seq[t], qs, qe - qs + 1); + global_cluster.add(_seq); + } } - if (printed) { - cerr << endl ; + if (global_cluster.size() >= min_w) { + global_cluster.set_cov(cov); + _p_clusters[t].push_back(global_cluster); + ++extcl; + } else { + ++small_extcl; } - for (int i = 0; i < threads; i++) { - if (len[i] > 0) { - free(seq[i]) ; - } - bam_destroy1(_p_aln[i]) ; - sam_close(_p_bam_file[i]) ; + if (t == 0) { + time(&u); + if (u - s > 30) { + cerr << "[I] Processed " << std::left << i + << " clusters so far. Clusters per second: " << i / (u - f) + << ". Time: " << u - f << "\r"; + time(&s); + printed = true; + } } + } + if (printed) { + cerr << endl; + } + for (int i = 0; i < threads; i++) { + if (len[i] > 0) { + free(seq[i]); + } + bam_destroy1(_p_aln[i]); + sam_close(_p_bam_file[i]); + } } -vector Extender::cluster_by_length(const Cluster& cluster) { - vector clusters_by_len; - for (const string &seq: cluster.get_seqs()) { - int i; - for (i = 0; i < clusters_by_len.size(); i++) { - if (abs((int) clusters_by_len[i].get_len() - (int) seq.size()) <= min_d) { - break ; - } - } - if (i == clusters_by_len.size()) { - clusters_by_len.push_back(Cluster(cluster.chrom, cluster.s, cluster.e, cluster.cov)); - } - clusters_by_len[i].add(seq); +vector Extender::cluster_by_length(const Cluster &cluster) { + vector clusters_by_len; + for (const string &seq : cluster.get_seqs()) { + int i; + for (i = 0; i < clusters_by_len.size(); i++) { + if (abs((int)clusters_by_len[i].get_len() - (int)seq.size()) <= min_d) { + break; + } + } + if (i == clusters_by_len.size()) { + clusters_by_len.push_back( + Cluster(cluster.chrom, cluster.s, cluster.e, cluster.cov)); } - return clusters_by_len ; + clusters_by_len[i].add(seq); + } + return clusters_by_len; } vector> Extender::parse_cigar(string cigar) { - // -- Parsing CIGAR - vector> cigar_pairs ; - regex r("([0-9]+)([MIDNSHPX=])") ; - regex_iterator rit(cigar.begin(), cigar.end(), r) ; - regex_iterator rend ; - while (rit != rend) { - int l = stoi(rit->str().substr(0, rit->str().size() - 1)) ; - char op = rit->str().substr(rit->str().size() - 1, 1)[0] ; - cigar_pairs.push_back(make_pair(l, op)) ; - ++rit ; - } - return cigar_pairs ; + // -- Parsing CIGAR + vector> cigar_pairs; + regex r("([0-9]+)([MIDNSHPX=])"); + regex_iterator rit(cigar.begin(), cigar.end(), r); + regex_iterator rend; + while (rit != rend) { + int l = stoi(rit->str().substr(0, rit->str().size() - 1)); + char op = rit->str().substr(rit->str().size() - 1, 1)[0]; + cigar_pairs.push_back(make_pair(l, op)); + ++rit; + } + return cigar_pairs; } void Extender::call() { - lprint({"Calling SVs from", to_string(clusters.size()), "clusters.."}) ; - time_t f ; - time(&f) ; - time_t s ; - time(&s) ; - time_t u ; - bool printed = false ; - #pragma omp parallel for num_threads(threads) schedule(static,1) - for (int _ = 0; _ < clusters.size(); _++) { - int t = _ % threads ; - const Cluster &cluster = clusters[_] ; - string chrom = cluster.chrom ; - if (cluster.size() < min_w) { - continue ; - } - const auto& clusters_by_len = cluster_by_length(cluster) ; - // --- Sorting clusters by #sequences to get first 2 most weighted clusters - int i_max1 = -1 ; - int i_max2 = -1 ; - uint v_max1 = 0 ; - uint v_max2 = 0 ; - for (uint i = 0; i < clusters_by_len.size(); ++i) { - if (clusters_by_len[i].size() > v_max1) { - v_max2 = v_max1; - i_max2 = i_max1; - v_max1 = clusters_by_len[i].size(); - i_max1 = i; - } - else if (clusters_by_len[i].size() > 
v_max2) { - v_max2 = clusters_by_len[i].size(); - i_max2 = i; - } - } - vector maxs ({i_max1, i_max2}); - // Genotyping the two most-weighted clusters - for (const int i : maxs) { - if (i == -1) { - continue; - } - Cluster c = clusters_by_len[i]; - if (c.size() < min_w) { - continue; - } - // --- Local realignment - vector _svs ; // svs on current cluster - string ref = string(chromosome_seqs[chrom] + c.s, c.e - c.s + 1); - string consensus = c.poa() ; - parasail_result_t *result = NULL; - result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), 10, 1, ¶sail_blosum62); - parasail_cigar_t *cigar = parasail_result_get_cigar(result, consensus.c_str(), consensus.size(), ref.c_str(), ref.size(), NULL); - string cigar_str = parasail_cigar_decode(cigar); - int score = result->score ; - parasail_cigar_free(cigar) ; - parasail_result_free(result) ; - _p_alignments[t].push_back(Consensus(consensus, cigar_str, chrom, c.s, c.e)) ; - // -- Extracting SVs - uint rpos = c.s ; // position on reference - uint cpos = 0 ; // position on consensus - auto cigar_pairs = parse_cigar(cigar_str) ; - int nv = 0 ; - for (const auto cigar_pair: cigar_pairs) { - uint l = cigar_pair.first; - char op = cigar_pair.second; - if (op == '=' || op == 'M') { - rpos += l; - cpos += l; - } else if (op == 'I') { - if (l > config->min_sv_length) { - SV sv = SV("INS", c.chrom, rpos, string(chromosome_seqs[chrom] + rpos - 1, 1), consensus.substr(cpos, l), c.size(), c.cov, nv, score, false, l) ; - _svs.push_back(sv) ; - nv++ ; - } - cpos += l; - } else if (op == 'D') { - if (l > config->min_sv_length) { - SV sv = SV("DEL", c.chrom, rpos, string(chromosome_seqs[chrom] + rpos - 1, l), string(chromosome_seqs[chrom] + rpos - 1, 1), c.size(), c.cov, nv, score, false, l) ; - _svs.push_back(sv) ; - nv++ ; - } - rpos += l; - } - } - for (int v = 0; v < _svs.size(); v++) { - _svs[v].ngaps = nv ; - } - // --- combine SVs on same consensus --- - //vector merged_svs ; - //std::sort(_svs.begin(), _svs.end()) ; - //for (int i = 1; i < _svs.size(); i++) { - // if (_svs[i].type == "DEL" == ) { - // _dels.push_back(_svs[i]) ; - // } else { - // _ins.push_back(_svs[i]) ; - // } - //} - // merge only overlapping SVs - //SV sv = _dels[0] ; - //for (int i = 1; i < _dels.size(); i++) { - // auto& del = _dels[i] ; - // if (del.s <= sv.e) { - // sv.e = del.e ; - // int overlap = sv.e - del.s + 1 ; - // sv.refall = sv.altall + del..substr(overlap, del.l - overlap) ; - // } - //} - // -- Combine svs with same length (maybe useless now - only if diploid mode) - //vector comb_svs; - //for (const SV &msv : merged_svs) { - // bool newsv_flag = true ; - // for (SV &csv: comb_svs) { - // if (abs(csv.l - msv.l) <= 10) { - // csv.w += msv.w ; - // newsv_flag = false ; - // } - // } - // if (newsv_flag) { - // comb_svs.push_back(msv); - // } - //} - for (const SV &sv: _svs) { - _p_svs[t].push_back(sv); - } - } - if (t == 0) { - time(&u) ; - if (u - s > 30) { - cerr << "[I] Processed " << std::left << _ << " clusters so far. Cluster per second: " << _ / (u - f) << ". 
Time: " << u - f << "\r" ; - time(&s) ; - } - } + lprint({"Calling SVs from", to_string(clusters.size()), "clusters.."}); + time_t f; + time(&f); + time_t s; + time(&s); + time_t u; + bool printed = false; +#pragma omp parallel for num_threads(threads) schedule(static, 1) + for (int _ = 0; _ < clusters.size(); _++) { + int t = _ % threads; + const Cluster &cluster = clusters[_]; + string chrom = cluster.chrom; + if (cluster.size() < min_w) { + continue; + } + const auto &clusters_by_len = cluster_by_length(cluster); + // --- Sorting clusters by #sequences to get first 2 most weighted clusters + int i_max1 = -1; + int i_max2 = -1; + uint v_max1 = 0; + uint v_max2 = 0; + for (uint i = 0; i < clusters_by_len.size(); ++i) { + if (clusters_by_len[i].size() > v_max1) { + v_max2 = v_max1; + i_max2 = i_max1; + v_max1 = clusters_by_len[i].size(); + i_max1 = i; + } else if (clusters_by_len[i].size() > v_max2) { + v_max2 = clusters_by_len[i].size(); + i_max2 = i; + } + } + vector maxs({i_max1, i_max2}); + // Genotyping the two most-weighted clusters + for (const int i : maxs) { + if (i == -1) { + continue; + } + Cluster c = clusters_by_len[i]; + if (c.size() < min_w) { + continue; + } + // --- Local realignment + vector _svs; // svs on current cluster + string ref = string(chromosome_seqs[chrom] + c.s, c.e - c.s + 1); + string consensus = c.poa(); + parasail_result_t *result = NULL; + result = parasail_nw_trace_striped_16(consensus.c_str(), consensus.size(), + ref.c_str(), ref.size(), 10, 1, + ¶sail_blosum62); + parasail_cigar_t *cigar = + parasail_result_get_cigar(result, consensus.c_str(), consensus.size(), + ref.c_str(), ref.size(), NULL); + string cigar_str = parasail_cigar_decode(cigar); + int score = result->score; + parasail_cigar_free(cigar); + parasail_result_free(result); + _p_alignments[t].push_back( + Consensus(consensus, cigar_str, chrom, c.s, c.e)); + // -- Extracting SVs + uint rpos = c.s; // position on reference + uint cpos = 0; // position on consensus + auto cigar_pairs = parse_cigar(cigar_str); + int nv = 0; + for (const auto cigar_pair : cigar_pairs) { + uint l = cigar_pair.first; + char op = cigar_pair.second; + if (op == '=' || op == 'M') { + rpos += l; + cpos += l; + } else if (op == 'I') { + if (l > config->min_sv_length) { + SV sv = SV("INS", c.chrom, rpos, + string(chromosome_seqs[chrom] + rpos - 1, 1), + consensus.substr(cpos, l), c.size(), c.cov, nv, score, + false, l); + _svs.push_back(sv); + nv++; + } + cpos += l; + } else if (op == 'D') { + if (l > config->min_sv_length) { + SV sv = SV("DEL", c.chrom, rpos, + string(chromosome_seqs[chrom] + rpos - 1, l), + string(chromosome_seqs[chrom] + rpos - 1, 1), c.size(), + c.cov, nv, score, false, l); + _svs.push_back(sv); + nv++; + } + rpos += l; + } + } + for (int v = 0; v < _svs.size(); v++) { + _svs[v].ngaps = nv; + } + // --- combine SVs on same consensus --- + // vector merged_svs ; + // std::sort(_svs.begin(), _svs.end()) ; + // for (int i = 1; i < _svs.size(); i++) { + // if (_svs[i].type == "DEL" == ) { + // _dels.push_back(_svs[i]) ; + // } else { + // _ins.push_back(_svs[i]) ; + // } + //} + // merge only overlapping SVs + // SV sv = _dels[0] ; + // for (int i = 1; i < _dels.size(); i++) { + // auto& del = _dels[i] ; + // if (del.s <= sv.e) { + // sv.e = del.e ; + // int overlap = sv.e - del.s + 1 ; + // sv.refall = sv.altall + del..substr(overlap, del.l - overlap) ; + // } + //} + // -- Combine svs with same length (maybe useless now - only if diploid + // mode) + // vector comb_svs; + // for (const SV &msv : 
merged_svs) { + // bool newsv_flag = true ; + // for (SV &csv: comb_svs) { + // if (abs(csv.l - msv.l) <= 10) { + // csv.w += msv.w ; + // newsv_flag = false ; + // } + // } + // if (newsv_flag) { + // comb_svs.push_back(msv); + // } + //} + for (const SV &sv : _svs) { + _p_svs[t].push_back(sv); + } } - if (printed) { - cerr << endl ; + if (t == 0) { + time(&u); + if (u - s > 30) { + cerr << "[I] Processed " << std::left << _ + << " clusters so far. Cluster per second: " << _ / (u - f) + << ". Time: " << u - f << "\r"; + time(&s); + } } + } + if (printed) { + cerr << endl; + } } void Extender::filter_sv_chains() { - std::sort(svs.begin(), svs.end()) ; - if (svs.size() < 2) { - return ; - } - lprint({to_string(svs.size()), "SVs before chain filtering."}) ; - vector _svs ; - auto& prev = svs[0] ; - bool reset = false ; - for (int i = 1; i < svs.size(); i++) { - if (reset) { - reset = false ; - prev = svs[i] ; - continue ; - } - auto& sv = svs[i] ; - if (sv.chrom == prev.chrom && sv.s - prev.e < 2 * sv.l && prev.type == sv.type) { - //cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s << endl ; - // check for sequence similarity - double w_r = max((double)sv.w, (double)prev.w) / min((double)sv.w, (double)prev.w) ; - double l_r = min((double)sv.l, (double)prev.l) / max((double)sv.l, (double)prev.l) ; - if (w_r >= 2 && l_r >= 0.9) { - double d ; - if (sv.type == "DEL") { - d = rapidfuzz::fuzz::ratio(sv.refall, prev.refall) ; - } else { - d = rapidfuzz::fuzz::ratio(sv.altall, prev.altall) ; - } - //cout << w_r << " " << l_r << " " << d << endl ; - if (d > 70) { - if (sv.w > prev.w) { - _svs.push_back(sv) ; - } else { - _svs.push_back(prev) ; - } - reset = true ; - continue ; - } - } - } - if (prev.s == 205603627) { - break ; - } - _svs.push_back(prev) ; - prev = sv ; - } - _svs.push_back(prev) ; - svs.clear() ; - svs.insert(svs.begin(), _svs.begin(), _svs.end()) ; - lprint({to_string(svs.size()), "SVs after chain filtering."}) ; + std::sort(svs.begin(), svs.end()); + if (svs.size() < 2) { + return; + } + lprint({to_string(svs.size()), "SVs before chain filtering."}); + vector _svs; + auto &prev = svs[0]; + bool reset = false; + for (int i = 1; i < svs.size(); i++) { + if (reset) { + reset = false; + prev = svs[i]; + continue; + } + auto &sv = svs[i]; + if (sv.chrom == prev.chrom && sv.s - prev.e < 2 * sv.l && + prev.type == sv.type) { + // cout << sv.chrom << " " << sv.s << " " << sv.type << " ---- " << prev.s + // << endl ; + // check for sequence similarity + double w_r = + max((double)sv.w, (double)prev.w) / min((double)sv.w, (double)prev.w); + double l_r = + min((double)sv.l, (double)prev.l) / max((double)sv.l, (double)prev.l); + if (w_r >= 2 && l_r >= 0.9) { + double d; + if (sv.type == "DEL") { + d = rapidfuzz::fuzz::ratio(sv.refall, prev.refall); + } else { + d = rapidfuzz::fuzz::ratio(sv.altall, prev.altall); + } + // cout << w_r << " " << l_r << " " << d << endl ; + if (d > 70) { + if (sv.w > prev.w) { + _svs.push_back(sv); + } else { + _svs.push_back(prev); + } + reset = true; + continue; + } + } + } + if (prev.s == 205603627) { + break; + } + _svs.push_back(prev); + prev = sv; + } + _svs.push_back(prev); + svs.clear(); + svs.insert(svs.begin(), _svs.begin(), _svs.end()); + lprint({to_string(svs.size()), "SVs after chain filtering."}); } diff --git a/extender.hpp b/extender.hpp index 75c6769..00b5741 100644 --- a/extender.hpp +++ b/extender.hpp @@ -1,95 +1,99 @@ #ifndef EXTENDER_HPP #define EXTENDER_HPP -#include +#include #include +#include +#include #include 
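// --- Editor's illustration (not part of the patch) ---------------------------
// The pairwise test that filter_sv_chains() in extender.cpp (above) applies to
// two nearby calls of the same type, isolated into one helper. The thresholds
// (weight ratio >= 2, length ratio >= 0.9, fuzz ratio > 70) are taken from that
// function; the helper name and argument layout are this sketch's own.
#include <algorithm>
#include <string>

#include "rapidfuzz/fuzz.hpp"

static bool looks_like_same_event(const std::string &allele_a, double w_a,
                                  double len_a, const std::string &allele_b,
                                  double w_b, double len_b) {
  const double w_r = std::max(w_a, w_b) / std::min(w_a, w_b);     // support imbalance
  const double l_r = std::min(len_a, len_b) / std::max(len_a, len_b); // length agreement
  if (w_r < 2.0 || l_r < 0.9)
    return false;
  // rapidfuzz::fuzz::ratio returns a similarity score in [0, 100]
  return rapidfuzz::fuzz::ratio(allele_a, allele_b) > 70.0;
}
// -----------------------------------------------------------------------------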
#include -#include -#include -#include "parasail.h" -#include "htslib/sam.h" #include "htslib/hts.h" +#include "htslib/sam.h" #include "interval_tree.hpp" +#include "parasail.h" +#include "parasail/matrices/blosum62.h" #include "rapidfuzz/fuzz.hpp" #include "rapidfuzz/utils.hpp" -#include "parasail/matrices/blosum62.h" -#include "sv.hpp" #include "bam.hpp" -#include "sfs.hpp" -#include "config.hpp" -#include "cluster.hpp" -#include "clipper.hpp" #include "chromosomes.hpp" +#include "clipper.hpp" +#include "cluster.hpp" +#include "config.hpp" +#include "sfs.hpp" +#include "sv.hpp" using namespace lib_interval_tree; class Extender { private: + // parameters: + uint min_w = 4; + uint min_d = 15; + uint maxw = 100; + uint kmer_size = 7; + // book keeping: + uint skip_1 = 0; // SFS skipped since no first/last base can be placed from + // read alignment (should be rare) + uint skip_2 = 0; // SFS skipped since it couldn't be extended + uint skip_3 = 0; // SFS skipped since reads starts/ends inside a cluster + uint skip_4 = 0; // SFS skipped since reads starts/ends inside a cluster + uint extcl = 0; // number of extended clusters (after clustering) + uint small_cl = 0; // number of cluster (before clustering) with low support + uint small_extcl = + 0; // number of extended clusters (after clustering) with low support - // parameters: - uint min_w = 4 ; - uint min_d = 15 ; - uint maxw = 100 ; - uint kmer_size = 7 ; - // book keeping: - uint skip_1 = 0 ; // SFS skipped since no first/last base can be placed from read alignment (should be rare) - uint skip_2 = 0 ; // SFS skipped since it couldn't be extended - uint skip_3 = 0 ; // SFS skipped since reads starts/ends inside a cluster - uint skip_4 = 0 ; // SFS skipped since reads starts/ends inside a cluster - uint extcl = 0 ; // number of extended clusters (after clustering) - uint small_cl = 0 ; // number of cluster (before clustering) with low support - uint small_extcl = 0 ; // number of extended clusters (after clustering) with low support - - Configuration* config ; - - samFile* bam_file ; - hts_idx_t* bam_index ; - bam_hdr_t* bam_header ; - - std::vector clusters ; - std::vector ext_clusters ; - std::unordered_map>* SFSs ; - std::vector extended_sfs ; - - void extend_parallel() ; - void extend_alignment(bam1_t* aln, int index) ; - void process_batch(vector bam_entries, int) ; - bool load_batch_bam(int threads, int batch_size, int p) ; - std::pair get_unique_kmers(const std::vector> &alpairs, const uint k, const bool from_end, std::string chrom) ; - - void extract_sfs_sequences() ; - void cluster_interval_tree() ; - void cluster_no_interval_tree() ; - - void call() ; - void filter_sv_chains() ; - std::vector> parse_cigar(std::string) ; - std::vector cluster_by_length(const Cluster& cluster) ; - - // parallelize - int threads ; - int batch_size ; - std::vector> _p_svs ; - std::vector> _p_clips ; - std::vector> _p_extended_sfs ; - std::vector> _p_clusters ; - std::vector> _p_alignments ; - std::vector>> bam_entries ; - std::vector>> _p_tree ; - std::vector, std::vector>> _p_sfs_clusters ; + Configuration *config; + + samFile *bam_file; + hts_idx_t *bam_index; + bam_hdr_t *bam_header; + + std::vector clusters; + std::vector ext_clusters; + std::unordered_map> *SFSs; + std::vector extended_sfs; + + void extend_parallel(); + void extend_alignment(bam1_t *aln, int index); + void process_batch(vector bam_entries, int); + bool load_batch_bam(int threads, int batch_size, int p); + std::pair + get_unique_kmers(const std::vector> &alpairs, + const uint k, 
const bool from_end, std::string chrom); + + void extract_sfs_sequences(); + void cluster_interval_tree(); + void cluster_no_interval_tree(); + + void call(); + void filter_sv_chains(); + std::vector> parse_cigar(std::string); + std::vector cluster_by_length(const Cluster &cluster); + + // parallelize + int threads; + int batch_size; + std::vector> _p_svs; + std::vector> _p_clips; + std::vector> _p_extended_sfs; + std::vector> _p_clusters; + std::vector> _p_alignments; + std::vector>> bam_entries; + std::vector>> _p_tree; + std::vector, std::vector>> + _p_sfs_clusters; public: - Extender(std::unordered_map>*) ; - - std::vector svs ; - std::vector clips ; - std::vector alignments ; + Extender(std::unordered_map> *); + + std::vector svs; + std::vector clips; + std::vector alignments; - void run(int threads) ; + void run(int threads); }; #endif diff --git a/fastq.hpp b/fastq.hpp index fa01e10..09e8c6c 100644 --- a/fastq.hpp +++ b/fastq.hpp @@ -1,43 +1,43 @@ #ifndef FST_HPP #define FST_HPP +#include #include -#include #include -#include #include +#include #ifdef __GNUC__ #pragma GCC system_header #endif -#include #include "kseq.h" +#include KSEQ_INIT(gzFile, gzread) struct fastq_entry_t { - std::string head ; - std::string seq ; - std::string qual ; - uint start ; - uint len ; - // constructor - fastq_entry_t(const std::string &h, const std::string &s, const std::string &q, const uint st = 0, const uint l = 0) : head(h), seq(s), qual(q) { - len = l ; - start = st ; - } - bool operator==(const fastq_entry_t &o) const { - return seq == o.seq ; - } + std::string head; + std::string seq; + std::string qual; + uint start; + uint len; + // constructor + fastq_entry_t(const std::string &h, const std::string &s, + const std::string &q, const uint st = 0, const uint l = 0) + : head(h), seq(s), qual(q) { + len = l; + start = st; + } + bool operator==(const fastq_entry_t &o) const { return seq == o.seq; } }; namespace std { - template <> struct hash { - std::size_t operator()(const fastq_entry_t& k) const { - return std::hash()(k.seq) ; - } - }; -} +template <> struct hash { + std::size_t operator()(const fastq_entry_t &k) const { + return std::hash()(k.seq); + } +}; +} // namespace std #endif diff --git a/lprint.cpp b/lprint.cpp index eaa8458..ff2c418 100644 --- a/lprint.cpp +++ b/lprint.cpp @@ -1,26 +1,28 @@ #include "lprint.hpp" -void lprint(const initializer_list &tokens, int loglevel, int minlevel) { - if (loglevel < minlevel) { - return; - } - char ctype; - switch (loglevel) { - case 0: - ctype = 'I'; - break; - case 1: - ctype = 'W'; - break; - case 2: - ctype = 'E'; - break; - default: - ctype = 'I'; - } - cerr << "[" << ctype << "]" << " " ; - for (const string &token : tokens) { - cerr << token << " "; - } - cerr << endl; +void lprint(const initializer_list &tokens, int loglevel, + int minlevel) { + if (loglevel < minlevel) { + return; + } + char ctype; + switch (loglevel) { + case 0: + ctype = 'I'; + break; + case 1: + ctype = 'W'; + break; + case 2: + ctype = 'E'; + break; + default: + ctype = 'I'; + } + cerr << "[" << ctype << "]" + << " "; + for (const string &token : tokens) { + cerr << token << " "; + } + cerr << endl; } diff --git a/main.cpp b/main.cpp index f893081..f2a5992 100644 --- a/main.cpp +++ b/main.cpp @@ -1,39 +1,39 @@ -#include -#include -#include -#include +#include #include -#include +#include +#include #include #include -#include -#include #include +#include +#include +#include #include -#include +#include #include #include +#include #include +#include "chromosomes.hpp" 
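// A minimal sketch of how the std::hash specialization in fastq.hpp (hash and
// operator== both look only at `seq`) lets reads be deduplicated by sequence
// in an unordered container. The `entry_t` struct below is a trimmed
// stand-in for the real fastq_entry_t, not the project's header; illustration
// only.
#include <cassert>
#include <string>
#include <unordered_set>

struct entry_t {
  std::string head, seq, qual;
  bool operator==(const entry_t &o) const { return seq == o.seq; }
};

namespace std {
template <> struct hash<entry_t> {
  std::size_t operator()(const entry_t &e) const {
    return std::hash<std::string>()(e.seq);
  }
};
} // namespace std

int main() {
  std::unordered_set<entry_t> uniq;
  uniq.insert(entry_t{"read/1", "ACGTACGT", "IIIIIIII"});
  uniq.insert(entry_t{"read/2", "ACGTACGT", "JJJJJJJJ"}); // same sequence, new name
  assert(uniq.size() == 1); // collapsed: identity is the sequence alone
  return 0;
}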
#include "config.hpp" #include "lprint.hpp" #include "ping_pong.hpp" -#include "chromosomes.hpp" -#include "caller.hpp" #include "assembler.hpp" +#include "caller.hpp" #include "smoother.hpp" #ifdef LOCAL_BUILD -#include "finder.hpp" -#include "shifter.hpp" -#include "scanner.hpp" #include "extractor.hpp" -#include "kmer_finder.hpp" +#include "finder.hpp" #include "haplotype_shifter.hpp" +#include "kmer_finder.hpp" +#include "scanner.hpp" +#include "shifter.hpp" #endif -using namespace std ; +using namespace std; const string version = "v1.0.4"; @@ -43,103 +43,122 @@ const string version = "v1.0.4"; // ============================================================================= \\ void create_workdir() { - auto c = Configuration::getInstance() ; - struct stat info ; - lprint({c->workdir}, 0); - if (stat(c->workdir.c_str(), &info) != 0) { - lprint({"Working directory does not exist. creating.."}, 0); - int check = mkdir(c->workdir.c_str(), 0777) ; - if (check != 0) { - lprint({"Failed to create output directory, aborting.."}, 2); - exit(check) ; - } + auto c = Configuration::getInstance(); + struct stat info; + lprint({c->workdir}, 0); + if (stat(c->workdir.c_str(), &info) != 0) { + lprint({"Working directory does not exist. creating.."}, 0); + int check = mkdir(c->workdir.c_str(), 0777); + if (check != 0) { + lprint({"Failed to create output directory, aborting.."}, 2); + exit(check); } + } } void print_help() { - cerr << endl; - cerr << "Usage: " << endl; - cerr << "\t* Index reference/sample:" << endl ; - cerr << "\t\tSVDSS index --fastq/--fasta /path/to/genome/file --index /path/to/output/index/file" << endl ; - cerr << endl; - cerr << "\t\tOptional arguments: " << endl ; - cerr << "\t\t\t-b, --binary\t\t\t\toutput index in binary format. Allows for another index to be appended to this index later." << endl ; - cerr << "\t\t\t-a, --append /path/to/binary/index\tappend to existing binary index." << endl ; - cerr << endl; - cerr << "\t* Extract SFS from BAM/FASTQ/FASTA files:" << endl ; - cerr << "\t\tSVDSS search --index /path/to/index --fastq/--bam /path/to/input --workdir /output/directory" << endl ; - cerr << endl; - cerr << "\t\tOptional arguments: " << endl ; - cerr << "\t\t\t--assemble\t\t\t\tautomatically runs SVDSS assemble on output" << endl ; - cerr << endl; - cerr << "\t* Assemble SFS into superstrings:" << endl ; - cerr << "\t\tSVDSS assemble --workdir /path/to/.sfs/files --batches /number/of/SFS/batches" << endl ; - cerr << endl; - cerr << "\t* Reconstruct sample:" << endl ; - cerr << "\t\tSVDSS smooth --workdir /output/file/direcotry --bam /path/to/input/bam/file --reference /path/to/reference/genome/fasta" << endl ; - cerr << endl; - cerr << "\t* Call SVs:" << endl ; - cerr << "\t\tSVDSS call --workdir /path/to/assembled/.sfs/files --bam /path/to/input/bam/file --reference /path/to/reference/genome/fasta" << endl ; - cerr << endl; - cerr << "\t\tOptional arguments: " << endl ; - cerr << "\t\t\t--clipped\t\t\t\tcalls SVs from clipped SFS." << endl ; - cerr << "\t\t\t--min-cluster-weight\t\t\tminimum number of supporting superstrings for a call to be reported." << endl ; - cerr << "\t\t\t--min-sv-length\t\t\t\tminimum length of reported SVs. Default is 25. Values < 25 are ignored." << endl ; - cerr << endl; - cerr << "General options: " << endl ; - cerr << "\t--threads\t\t\t\t\t\tsets number of threads, default 4." << endl ; - cerr << "\t--version\t\t\t\t\t\tprint version information." << endl ; - cerr << "\t--help\t\t\t\t\t\t\tprint this help message." 
<< endl ; - cerr << endl; + cerr << endl; + cerr << "Usage: " << endl; + cerr << "\t* Index reference/sample:" << endl; + cerr << "\t\tSVDSS index --fastq/--fasta /path/to/genome/file --index " + "/path/to/output/index/file" + << endl; + cerr << endl; + cerr << "\t\tOptional arguments: " << endl; + cerr << "\t\t\t-b, --binary\t\t\t\toutput index in binary format. Allows for " + "another index to be appended to this index later." + << endl; + cerr << "\t\t\t-a, --append /path/to/binary/index\tappend to existing binary " + "index." + << endl; + cerr << endl; + cerr << "\t* Extract SFS from BAM/FASTQ/FASTA files:" << endl; + cerr << "\t\tSVDSS search --index /path/to/index --fastq/--bam " + "/path/to/input --workdir /output/directory" + << endl; + cerr << endl; + cerr << "\t\tOptional arguments: " << endl; + cerr << "\t\t\t--assemble\t\t\t\tautomatically runs SVDSS assemble on output" + << endl; + cerr << endl; + cerr << "\t* Assemble SFS into superstrings:" << endl; + cerr << "\t\tSVDSS assemble --workdir /path/to/.sfs/files --batches " + "/number/of/SFS/batches" + << endl; + cerr << endl; + cerr << "\t* Reconstruct sample:" << endl; + cerr << "\t\tSVDSS smooth --workdir /output/file/direcotry --bam " + "/path/to/input/bam/file --reference /path/to/reference/genome/fasta" + << endl; + cerr << endl; + cerr << "\t* Call SVs:" << endl; + cerr << "\t\tSVDSS call --workdir /path/to/assembled/.sfs/files --bam " + "/path/to/input/bam/file --reference /path/to/reference/genome/fasta" + << endl; + cerr << endl; + cerr << "\t\tOptional arguments: " << endl; + cerr << "\t\t\t--clipped\t\t\t\tcalls SVs from clipped SFS." << endl; + cerr << "\t\t\t--min-cluster-weight\t\t\tminimum number of supporting " + "superstrings for a call to be reported." + << endl; + cerr << "\t\t\t--min-sv-length\t\t\t\tminimum length of reported SVs. " + "Default is 25. Values < 25 are ignored." + << endl; + cerr << endl; + cerr << "General options: " << endl; + cerr << "\t--threads\t\t\t\t\t\tsets number of threads, default 4." << endl; + cerr << "\t--version\t\t\t\t\t\tprint version information." << endl; + cerr << "\t--help\t\t\t\t\t\t\tprint this help message." << endl; + cerr << endl; } -int main(int argc, char** argv) { - cerr << "SVDSS, Structural Variant Discovery from Sample-specific Strings." << endl ; - time_t t ; - time(&t) ; - auto c = Configuration::getInstance() ; +int main(int argc, char **argv) { + cerr << "SVDSS, Structural Variant Discovery from Sample-specific Strings." 
+ << endl; + time_t t; + time(&t); + auto c = Configuration::getInstance(); - if (argc == 1) { - print_help(); - exit(1); - } + if (argc == 1) { + print_help(); + exit(1); + } - c->parse(argc, argv) ; + c->parse(argc, argv); - if (c->version) { - cout << "SVDSS, " << version << endl; - exit(0); - } + if (c->version) { + cout << "SVDSS, " << version << endl; + exit(0); + } - if (argc == 1 || c->help) { - print_help() ; - exit(0) ; - } - cerr << "Mode: " << argv[1] << endl; - create_workdir() ; - if (strcmp(argv[1], "call") == 0) { - auto caller = new Caller() ; - caller->run() ; - } else if (strcmp(argv[1], "index") == 0) { - auto pingpong = new PingPong() ; - pingpong->index() ; - } else if (strcmp(argv[1], "search") == 0) { - auto pingpong = new PingPong() ; - pingpong->search() ; - } else if (strcmp(argv[1], "assemble") == 0) { - auto assembler = new Assembler() ; - assembler->run() ; - } else if (strcmp(argv[1], "smooth") == 0) { - auto smoother = new Smoother() ; - smoother->run() ; - } else { - print_help(); - exit(1); - } - time_t s ; - time(&s) ; - lprint({"Complete. Runtime:", to_string(s - t) + " seconds."}) ; + if (argc == 1 || c->help) { + print_help(); + exit(0); + } + cerr << "Mode: " << argv[1] << endl; + create_workdir(); + if (strcmp(argv[1], "call") == 0) { + auto caller = new Caller(); + caller->run(); + } else if (strcmp(argv[1], "index") == 0) { + auto pingpong = new PingPong(); + pingpong->index(); + } else if (strcmp(argv[1], "search") == 0) { + auto pingpong = new PingPong(); + pingpong->search(); + } else if (strcmp(argv[1], "assemble") == 0) { + auto assembler = new Assembler(); + assembler->run(); + } else if (strcmp(argv[1], "smooth") == 0) { + auto smoother = new Smoother(); + smoother->run(); + } else { + print_help(); + exit(1); + } + time_t s; + time(&s); + lprint({"Complete. Runtime:", to_string(s - t) + " seconds."}); - return 0; + return 0; } - diff --git a/ping_pong.cpp b/ping_pong.cpp index ffff46f..8130f84 100644 --- a/ping_pong.cpp +++ b/ping_pong.cpp @@ -1,445 +1,490 @@ #include "ping_pong.hpp" std::string interval2str(rldintv_t sai) { - return "[" + std::to_string(sai.x[0]) + "," + std::to_string(sai.x[1]) + "," + std::to_string(sai.x[2]) + "]" ; + return "[" + std::to_string(sai.x[0]) + "," + std::to_string(sai.x[1]) + "," + + std::to_string(sai.x[2]) + "]"; } static inline int kputsn(const char *p, int l, kstring_t *s) { - if (s->l + l + 1 >= s->m) { - char *tmp; - s->m = s->l + l + 2; - kroundup32(s->m); - if ((tmp = (char*)realloc(s->s, s->m))) s->s = tmp; - else return EOF; - } - memcpy(s->s + s->l, p, l); - s->l += l; - s->s[s->l] = 0; - return l; + if (s->l + l + 1 >= s->m) { + char *tmp; + s->m = s->l + l + 2; + kroundup32(s->m); + if ((tmp = (char *)realloc(s->s, s->m))) + s->s = tmp; + else + return EOF; + } + memcpy(s->s + s->l, p, l); + s->l += l; + s->s[s->l] = 0; + return l; } void seq_char2nt6(int l, unsigned char *s) { - int i ; - for (i = 0; i < l; ++i) { - s[i] = s[i] < 128? seq_nt6_table[s[i]] : 5 ; - } + int i; + for (i = 0; i < l; ++i) { + s[i] = s[i] < 128 ? 
seq_nt6_table[s[i]] : 5; + } } -bool PingPong::check_solution(rld_t* index, std::string S) { - int l = S.length() ; - uint8_t *P = (uint8_t*) S.c_str() ; - seq_char2nt6(l, P); - bool found_full = backward_search(index, P, l - 1) ; - bool found_prefix = backward_search(index, P, l - 2) ; - bool found_suffix = backward_search(index, P + 1, l - 2) ; - return !found_full & (found_prefix || found_suffix) ; +bool PingPong::check_solution(rld_t *index, std::string S) { + int l = S.length(); + uint8_t *P = (uint8_t *)S.c_str(); + seq_char2nt6(l, P); + bool found_full = backward_search(index, P, l - 1); + bool found_prefix = backward_search(index, P, l - 2); + bool found_suffix = backward_search(index, P + 1, l - 2); + return !found_full & (found_prefix || found_suffix); } fastq_entry_t PingPong::get_solution(fastq_entry_t fqe, int s, int l) { - std::string S (fqe.seq, s, l) ; - std::string Q (fqe.qual, s, l) ; - return fastq_entry_t(fqe.head, S, Q, s, l) ; + std::string S(fqe.seq, s, l); + std::string Q(fqe.qual, s, l); + return fastq_entry_t(fqe.head, S, Q, s, l); } bool PingPong::backward_search(rld_t *index, const uint8_t *P, int p2) { - rldintv_t sai ; // rldintv_t is the struct used to store a SA interval. - fm6_set_intv(index, P[p2], sai) ; - while (sai.x[2] != 0) { - p2 -= 1 ; - rldintv_t osai[6] ; - rld_extend(index, &sai, osai, 1) ; //1: backward, 0: forward - sai = osai[P[p2]] ; - if (p2 == 0) { - break ; - } + rldintv_t sai; // rldintv_t is the struct used to store a SA interval. + fm6_set_intv(index, P[p2], sai); + while (sai.x[2] != 0) { + p2 -= 1; + rldintv_t osai[6]; + rld_extend(index, &sai, osai, 1); // 1: backward, 0: forward + sai = osai[P[p2]]; + if (p2 == 0) { + break; } - return sai.x[2] != 0 ; + } + return sai.x[2] != 0; } // This will be very fast for smoothed reads -// However non-smoothed reads are going to produce loads of crappy SFS, unless we filter them -void PingPong::ping_pong_search(rld_t *index, uint8_t* P, int l, std::vector& solutions, bool is_smoothed, bam1_t* aln) { - //cout << bam_get_qname(aln) << endl ; - rldintv_t sai ; - int begin = l - 1 ; - bool last_jump = false ; - while (begin >= 0) { - // Backward search. Find a mismatching sequence. Stop at first mismatch. - int bmatches = 0 ; - fm6_set_intv(index, P[begin], sai) ; - //DEBUG(cerr << "BS from " << int2char[P[begin]] << " (" << begin << "): " << interval2str(sai) << endl ;) - bmatches = 0 ; - while (sai.x[2] != 0 && begin > 0) { - begin-- ; - bmatches++ ; - rldintv_t osai[6] ; // output SA intervals (one for each symbol between 0 and 5) - rld_extend(index, &sai, osai, 1) ; - sai = osai[P[begin]] ; - //DEBUG(cerr << "- BE with " << int2char[P[begin]] << " (" << begin << "): " << interval2str(sai) << endl ;) - } - //last sequence was a match - if (begin == 0 && sai.x[2] != 0) { - break ; - } - //DEBUG(cerr << "Mismatch " << int2char[P[begin]] << " (" << begin << "). bmatches: " << to_string(bmatches) << endl ;) - // Forward search: - int end = begin ; - int fmatches = 0 ; - fm6_set_intv(index, P[end], sai) ; - //DEBUG(cerr << "FS from " << int2char[P[end]] << " (" << end << "): " << interval2str(sai) << endl ;) - while(sai.x[2] != 0) { - end++ ; - fmatches++ ; - rldintv_t osai[6] ; - rld_extend(index, &sai, osai, 0) ; - sai = osai[P[end] >= 1 && P[end] <= 4 ? 5 - P[end] : P[end]]; - //DEBUG(cerr << "- FE with " << int2char[P[end]] << " (" << end << "): " << interval2str(sai) << endl ;) - } - //DEBUG(cerr << "Mismatch " << int2char[P[end]] << " (" << end << "). 
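// Sketch of the specificity test used by check_solution()/query() above: a
// string counts as sample-specific when the whole string is absent from the
// indexed sample but dropping either its first or its last character yields a
// string that is present. The FMD-index backward search is replaced here by a
// plain substring oracle over an in-memory text; `occurs` and `is_specific`
// are illustrative names, not the project's API.
#include <cassert>
#include <string>

// stand-in for backward_search(): does `s` occur anywhere in `text`?
static bool occurs(const std::string &text, const std::string &s) {
  return text.find(s) != std::string::npos;
}

static bool is_specific(const std::string &text, const std::string &S) {
  if (S.size() < 2)
    return false;
  bool full = occurs(text, S);
  bool prefix = occurs(text, S.substr(0, S.size() - 1)); // drop last base
  bool suffix = occurs(text, S.substr(1));               // drop first base
  return !full && (prefix || suffix);
}

int main() {
  std::string sample = "ACGTACGTTT";
  assert(!is_specific(sample, "CGTA"));  // present in the sample: not specific
  assert(is_specific(sample, "CGTAG"));  // absent, but its prefix "CGTA" occurs
  assert(!is_specific(sample, "GGGGG")); // absent and so are both flanks
  return 0;
}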
fmatches: " << fmatches << endl ;) - //DEBUG(cerr << "Adding [" << begin << ", " << end << "]." << endl ;) - int sfs_len = end - begin + 1 ; - int acc_len = end - begin + 1 ; - auto sfs = SFS{begin, sfs_len, 1, true} ; - solutions.push_back(sfs) ; - if (begin == 0) { - break ; - } - if (config->overlap == 0) { // Relaxed - begin -= 1 ; - } else { - begin = end + config->overlap ; // overlap < 0 - } +// However non-smoothed reads are going to produce loads of crappy SFS, unless +// we filter them +void PingPong::ping_pong_search(rld_t *index, uint8_t *P, int l, + std::vector &solutions, + bool is_smoothed, bam1_t *aln) { + // cout << bam_get_qname(aln) << endl ; + rldintv_t sai; + int begin = l - 1; + bool last_jump = false; + while (begin >= 0) { + // Backward search. Find a mismatching sequence. Stop at first mismatch. + int bmatches = 0; + fm6_set_intv(index, P[begin], sai); + // DEBUG(cerr << "BS from " << int2char[P[begin]] << " (" << begin << "): " + // << interval2str(sai) << endl ;) + bmatches = 0; + while (sai.x[2] != 0 && begin > 0) { + begin--; + bmatches++; + rldintv_t + osai[6]; // output SA intervals (one for each symbol between 0 and 5) + rld_extend(index, &sai, osai, 1); + sai = osai[P[begin]]; + // DEBUG(cerr << "- BE with " << int2char[P[begin]] << " (" << begin << + // "): " << interval2str(sai) << endl ;) + } + // last sequence was a match + if (begin == 0 && sai.x[2] != 0) { + break; } - //DEBUG(std::this_thread::sleep_for(std::chrono::seconds(2)) ;) - //std::this_thread::sleep_for(std::chrono::seconds(1)) ; + // DEBUG(cerr << "Mismatch " << int2char[P[begin]] << " (" << begin << "). + // bmatches: " << to_string(bmatches) << endl ;) + // Forward search: + int end = begin; + int fmatches = 0; + fm6_set_intv(index, P[end], sai); + // DEBUG(cerr << "FS from " << int2char[P[end]] << " (" << end << "): " << + // interval2str(sai) << endl ;) + while (sai.x[2] != 0) { + end++; + fmatches++; + rldintv_t osai[6]; + rld_extend(index, &sai, osai, 0); + sai = osai[P[end] >= 1 && P[end] <= 4 ? 5 - P[end] : P[end]]; + // DEBUG(cerr << "- FE with " << int2char[P[end]] << " (" << end << "): " + // << interval2str(sai) << endl ;) + } + // DEBUG(cerr << "Mismatch " << int2char[P[end]] << " (" << end << "). + // fmatches: " << fmatches << endl ;) DEBUG(cerr << "Adding [" << begin << + // ", " << end << "]." 
<< endl ;) + int sfs_len = end - begin + 1; + int acc_len = end - begin + 1; + auto sfs = SFS{begin, sfs_len, 1, true}; + solutions.push_back(sfs); + if (begin == 0) { + break; + } + if (config->overlap == 0) { // Relaxed + begin -= 1; + } else { + begin = end + config->overlap; // overlap < 0 + } + } + // DEBUG(std::this_thread::sleep_for(std::chrono::seconds(2)) ;) + // std::this_thread::sleep_for(std::chrono::seconds(1)) ; } bool PingPong::load_batch_bam(int threads, int batch_size, int p) { - int i = 0 ; - int n = 0 ; - while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { - auto alignment = bam_entries[p][n % threads][i] ; - if (alignment == nullptr) { - break ; - } - reads_processed += 1 ; - if (alignment->core.flag & BAM_FUNMAP || alignment->core.flag & BAM_FSUPPLEMENTARY || alignment->core.flag & BAM_FSECONDARY) { - continue ; - } - if (alignment->core.l_qseq < 100) { - continue ; - } - if (alignment->core.tid < 0) { - continue ; - } - uint32_t l = alignment->core.l_qseq ; //length of the read - if (read_seq_max_lengths[p][n % threads][i] < l) { - free(read_seqs[p][n % threads][i]) ; - read_seqs[p][n % threads][i] = (uint8_t*) malloc(sizeof(char) * (l + 1)) ; - read_seq_max_lengths[p][n % threads][i] = l ; - } - read_seq_lengths[p][n % threads][i] = l ; - uint8_t *q = bam_get_seq(alignment) ; - for (int _ = 0; _ < l; _++){ - read_seqs[p][n % threads][i][_] = seq_nt16_str[bam_seqi(q, _)] ; - } - read_seqs[p][n % threads][i][l] = '\0' ; - seq_char2nt6(l, read_seqs[p][n % threads][i]) ; // convert to integers - n += 1 ; - if (n % threads == 0) { - i += 1 ; - } - if (n == batch_size) { - return true ; - } + int i = 0; + int n = 0; + while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { + auto alignment = bam_entries[p][n % threads][i]; + if (alignment == nullptr) { + break; + } + reads_processed += 1; + if (alignment->core.flag & BAM_FUNMAP || + alignment->core.flag & BAM_FSUPPLEMENTARY || + alignment->core.flag & BAM_FSECONDARY) { + continue; + } + if (alignment->core.l_qseq < 100) { + continue; + } + if (alignment->core.tid < 0) { + continue; + } + uint32_t l = alignment->core.l_qseq; // length of the read + if (read_seq_max_lengths[p][n % threads][i] < l) { + free(read_seqs[p][n % threads][i]); + read_seqs[p][n % threads][i] = (uint8_t *)malloc(sizeof(char) * (l + 1)); + read_seq_max_lengths[p][n % threads][i] = l; } - //lprint({"Loaded", to_string(n), "BAM reads.."}); - return n != 0 ? true : false ; + read_seq_lengths[p][n % threads][i] = l; + uint8_t *q = bam_get_seq(alignment); + for (int _ = 0; _ < l; _++) { + read_seqs[p][n % threads][i][_] = seq_nt16_str[bam_seqi(q, _)]; + } + read_seqs[p][n % threads][i][l] = '\0'; + seq_char2nt6(l, read_seqs[p][n % threads][i]); // convert to integers + n += 1; + if (n % threads == 0) { + i += 1; + } + if (n == batch_size) { + return true; + } + } + // lprint({"Loaded", to_string(n), "BAM reads.."}); + return n != 0 ? 
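// Sketch of the ping-pong extension performed by PingPong::ping_pong_search()
// above: walk the read right-to-left, extend backwards until the growing
// string stops occurring in the sample, then extend forwards from the
// mismatching base, and report the resulting [begin, end] interval as a
// candidate specific string. The FMD-index is again replaced by a substring
// oracle, and only the relaxed re-anchoring (begin -= 1, used when
// config->overlap is 0) is shown; the real loop can also re-anchor at
// end + overlap with a negative overlap. `ping_pong_intervals` is an
// illustrative name; this is not the project's code.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

static bool occurs(const std::string &text, const std::string &s) {
  return text.find(s) != std::string::npos;
}

static std::vector<std::pair<int, int>>
ping_pong_intervals(const std::string &sample, const std::string &read) {
  std::vector<std::pair<int, int>> out;
  int begin = (int)read.size() - 1;
  while (begin >= 0) {
    // backward phase: grow read[begin..anchor] leftwards while it still occurs
    int anchor = begin;
    while (begin > 0 && occurs(sample, read.substr(begin, anchor - begin + 1)))
      --begin;
    if (begin == 0 && occurs(sample, read.substr(0, anchor + 1)))
      break; // the whole remaining prefix matches: nothing specific left
    // forward phase: grow read[begin..end] rightwards while it still occurs
    int end = begin;
    while (end + 1 < (int)read.size() &&
           occurs(sample, read.substr(begin, end - begin + 1)))
      ++end;
    out.emplace_back(begin, end); // read[begin..end] is absent from the sample
    if (begin == 0)
      break;
    --begin; // relaxed re-anchoring
  }
  return out;
}

int main() {
  std::string sample = "ACGTACGTACGT";
  std::string read = "ACGTTTACGT"; // carries a TT insertion w.r.t. the sample
  for (auto [s, e] : ping_pong_intervals(sample, read))
    std::cout << s << "\t" << (e - s + 1) << "\n"; // start, length, as in .sfs
  return 0;
}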
true : false; } bool PingPong::load_batch_fastq(int threads, int batch_size, int p) { - for (int i = 0; i < threads; i++) { - fastq_entries[p][i].clear() ; - } - int k = 0 ; - int i = 0 ; - int n = 0 ; - while ((k = kseq_read(fastq_iterator)) >= 0) { - if (fastq_iterator->qual.l) { - fastq_entries[p][n % threads].push_back(fastq_entry_t(fastq_iterator->name.s, fastq_iterator->seq.s, fastq_iterator->qual.s)) ; - } else { - fastq_entries[p][n % threads].push_back(fastq_entry_t(fastq_iterator->name.s, fastq_iterator->seq.s, fastq_iterator->name.s)) ; - } - int l = fastq_entries[p][n % threads][i].seq.size() ; - if (read_seq_max_lengths[p][n % threads][i] < l) { - free(read_seqs[p][n % threads][i]) ; - read_seqs[p][n % threads][i] = (uint8_t*) malloc(sizeof(char) * (l + 1)) ; - read_seq_max_lengths[p][n % threads][i] = l ; - } - read_seq_lengths[p][n % threads][i] = l ; - for (int _ = 0; _ < l; _++){ - read_seqs[p][n % threads][i][_] = fastq_entries[p][n % threads][i].seq[_] ; - } - read_seqs[p][n % threads][i][l] = '\0' ; - seq_char2nt6(l, read_seqs[p][n % threads][i]) ; // convert to integers - read_names[p][n % threads][i] = fastq_iterator->name.s ; - n += 1 ; - if (n % threads == 0) { - i += 1 ; - } - if (n == batch_size) { - return true ; - } + for (int i = 0; i < threads; i++) { + fastq_entries[p][i].clear(); + } + int k = 0; + int i = 0; + int n = 0; + while ((k = kseq_read(fastq_iterator)) >= 0) { + if (fastq_iterator->qual.l) { + fastq_entries[p][n % threads].push_back( + fastq_entry_t(fastq_iterator->name.s, fastq_iterator->seq.s, + fastq_iterator->qual.s)); + } else { + fastq_entries[p][n % threads].push_back( + fastq_entry_t(fastq_iterator->name.s, fastq_iterator->seq.s, + fastq_iterator->name.s)); + } + int l = fastq_entries[p][n % threads][i].seq.size(); + if (read_seq_max_lengths[p][n % threads][i] < l) { + free(read_seqs[p][n % threads][i]); + read_seqs[p][n % threads][i] = (uint8_t *)malloc(sizeof(char) * (l + 1)); + read_seq_max_lengths[p][n % threads][i] = l; } - //lprint({"Loaded", to_string(n), "FASTQ reads.."}); - return n != 0 ? true : false ; + read_seq_lengths[p][n % threads][i] = l; + for (int _ = 0; _ < l; _++) { + read_seqs[p][n % threads][i][_] = fastq_entries[p][n % threads][i].seq[_]; + } + read_seqs[p][n % threads][i][l] = '\0'; + seq_char2nt6(l, read_seqs[p][n % threads][i]); // convert to integers + read_names[p][n % threads][i] = fastq_iterator->name.s; + n += 1; + if (n % threads == 0) { + i += 1; + } + if (n == batch_size) { + return true; + } + } + // lprint({"Loaded", to_string(n), "FASTQ reads.."}); + return n != 0 ? true : false; } -batch_type_t PingPong::process_batch(rld_t* index, int p, int i) { - batch_type_t solutions ; - // store read id once for all strings to save space, is it worth it? 
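// Small sketch of the round-robin placement used by load_batch_bam() and
// load_batch_fastq() above: record n of a batch goes to per-thread bucket
// n % threads, and the in-bucket slot index i advances once every `threads`
// records, so record n ends up in slot n / threads. The buckets here are
// plain ints standing in for the bam1_t* / sequence buffers; illustration
// only.
#include <cassert>
#include <vector>

int main() {
  const int threads = 4, batch_size = 12;
  std::vector<std::vector<int>> buckets(threads);
  int i = 0; // in-bucket slot, advanced exactly as in the loader loops
  for (int n = 0; n < batch_size;) {
    buckets[n % threads].push_back(n); // record n -> bucket n % threads, slot i
    n += 1;
    if (n % threads == 0)
      i += 1;
  }
  // every bucket got batch_size / threads records; record n sits at slot n / threads
  for (int t = 0; t < threads; ++t)
    for (size_t s = 0; s < buckets[t].size(); ++s)
      assert(buckets[t][s] == (int)s * threads + t);
  assert(i == batch_size / threads);
  return 0;
}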
- if (mode == 0) { - for (int j = 0; j < read_seqs[p][i].size(); j++) { - if (read_seq_lengths[p][i][j] < 0) - // No need here (since ping_pong_search won't crash is #reads is smaller than batch size), but let's keep this - break; - ping_pong_search(index, read_seqs[p][i][j], read_seq_lengths[p][i][j], solutions[read_names[p][i][j]], false, nullptr) ; - } - } else { - for (int j = 0; j < read_seqs[p][i].size(); j++) { - if (read_seq_lengths[p][i][j] < 0) - // Avoid crash at next line when #alignments is smaller than batch size - break; - char *qname = bam_get_qname(bam_entries[p][i][j]) ; - bool is_smoothed = smoothed_reads.find(qname) != smoothed_reads.end() ; - if (config->putative) { - if (ignored_reads.find(qname) != ignored_reads.end()) { - continue ; - } - // was not ignored, so either it's smoothed or not: - if (!is_smoothed) { - continue ; - } - } - ping_pong_search(index, read_seqs[p][i][j], read_seq_lengths[p][i][j], solutions[qname], is_smoothed, bam_entries[p][i][j]) ; - } +batch_type_t PingPong::process_batch(rld_t *index, int p, int i) { + batch_type_t solutions; + // store read id once for all strings to save space, is it worth it? + if (mode == 0) { + for (int j = 0; j < read_seqs[p][i].size(); j++) { + if (read_seq_lengths[p][i][j] < 0) + // No need here (since ping_pong_search won't crash is #reads is smaller + // than batch size), but let's keep this + break; + ping_pong_search(index, read_seqs[p][i][j], read_seq_lengths[p][i][j], + solutions[read_names[p][i][j]], false, nullptr); + } + } else { + for (int j = 0; j < read_seqs[p][i].size(); j++) { + if (read_seq_lengths[p][i][j] < 0) + // Avoid crash at next line when #alignments is smaller than batch size + break; + char *qname = bam_get_qname(bam_entries[p][i][j]); + bool is_smoothed = smoothed_reads.find(qname) != smoothed_reads.end(); + if (config->putative) { + if (ignored_reads.find(qname) != ignored_reads.end()) { + continue; + } + // was not ignored, so either it's smoothed or not: + if (!is_smoothed) { + continue; + } + } + ping_pong_search(index, read_seqs[p][i][j], read_seq_lengths[p][i][j], + solutions[qname], is_smoothed, bam_entries[p][i][j]); } - return solutions ; + } + return solutions; } void PingPong::output_batch(int b) { - auto c = Configuration::getInstance(); - string path = c->workdir + "/solution_batch_" + std::to_string(current_batch) + (c->assemble ? ".assembled" : "") + ".sfs"; - //cerr << "Outputting to " << path << ".." << "\r" ; - std::ofstream o(path); - bool isreversed = config->bam != "" ; - uint64_t n = 0 ; - for (int i = last_dumped_batch; i < b; i++) { // for each of the unmerged batches - for (auto &batch: batches[i]) { // for each thread in batch - for (auto &read: batch) { // for each read in thread - if (c->assemble) { - Assembler a = Assembler(); - vector assembled_SFSs = a.assemble(read.second); - bool is_first = true; - for (const SFS &sfs : assembled_SFSs) { - o << (is_first ? read.first : "*") << "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c << "\t" << isreversed << endl; - is_first = false; - } - } else { - bool is_first = true; - for (auto &sfs: read.second) { // for each sfs in read - // optimize file output size by not outputing read name for every SFS - o << (is_first ? 
read.first : "*") << "\t" << sfs.s << "\t" << sfs.l << "\t" << sfs.c <<"\t" << isreversed << endl ; - is_first = false; - n += 1 ; - } - } - } - batch.clear() ; - } - batches[i].clear() ; + auto c = Configuration::getInstance(); + string path = c->workdir + "/solution_batch_" + + std::to_string(current_batch) + + (c->assemble ? ".assembled" : "") + ".sfs"; + // cerr << "Outputting to " << path << ".." << "\r" ; + std::ofstream o(path); + bool isreversed = config->bam != ""; + uint64_t n = 0; + for (int i = last_dumped_batch; i < b; + i++) { // for each of the unmerged batches + for (auto &batch : batches[i]) { // for each thread in batch + for (auto &read : batch) { // for each read in thread + if (c->assemble) { + Assembler a = Assembler(); + vector assembled_SFSs = a.assemble(read.second); + bool is_first = true; + for (const SFS &sfs : assembled_SFSs) { + o << (is_first ? read.first : "*") << "\t" << sfs.s << "\t" << sfs.l + << "\t" << sfs.c << "\t" << isreversed << endl; + is_first = false; + } + } else { + bool is_first = true; + for (auto &sfs : read.second) { // for each sfs in read + // optimize file output size by not outputing read name for every + // SFS + o << (is_first ? read.first : "*") << "\t" << sfs.s << "\t" << sfs.l + << "\t" << sfs.c << "\t" << isreversed << endl; + is_first = false; + n += 1; + } + } + } + batch.clear(); } - // this is actually the first batch to output next time - last_dumped_batch = b ; + batches[i].clear(); + } + // this is actually the first batch to output next time + last_dumped_batch = b; } int PingPong::search() { - config = Configuration::getInstance() ; - // parse arguments - lprint({"Restoring index.."}); - rld_t *index = rld_restore(config->index.c_str()) ; - lprint({"Done."}); - if (config->bam != "") { - lprint({"BAM input:", config->bam}); - bam_file = hts_open(config->bam.c_str(), "r") ; - bam_header = sam_hdr_read(bam_file) ; - bgzf_mt(bam_file->fp.bgzf, 8, 1) ; - mode = 1 ; - } else if (config->fastq != "") { - lprint({"FASTQ input:", config->fastq}); - fastq_file = gzopen(config->fastq.c_str(), "r") ; - fastq_iterator = kseq_init(fastq_file) ; - mode = 0 ; - } else { - lprint({"No input file provided, aborting.."}, 2); - exit(1) ; - } - if (config->putative) { - lprint({"Putative SFS extraction enabled."}) ; - load_smoothed_read_ids() ; - } - // allocate all necessary stuff - int p = 0 ; - int batch_size = (10000 / config->threads) * config->threads ; - for(int i = 0; i < 2; i++) { - bam_entries.push_back(vector>(config->threads)) ; - fastq_entries.push_back(vector>(config->threads)) ; // current and next output + config = Configuration::getInstance(); + // parse arguments + lprint({"Restoring index.."}); + rld_t *index = rld_restore(config->index.c_str()); + lprint({"Done."}); + if (config->bam != "") { + lprint({"BAM input:", config->bam}); + bam_file = hts_open(config->bam.c_str(), "r"); + bam_header = sam_hdr_read(bam_file); + bgzf_mt(bam_file->fp.bgzf, 8, 1); + mode = 1; + } else if (config->fastq != "") { + lprint({"FASTQ input:", config->fastq}); + fastq_file = gzopen(config->fastq.c_str(), "r"); + fastq_iterator = kseq_init(fastq_file); + mode = 0; + } else { + lprint({"No input file provided, aborting.."}, 2); + exit(1); + } + if (config->putative) { + lprint({"Putative SFS extraction enabled."}); + load_smoothed_read_ids(); + } + // allocate all necessary stuff + int p = 0; + int batch_size = (10000 / config->threads) * config->threads; + for (int i = 0; i < 2; i++) { + bam_entries.push_back(vector>(config->threads)); + 
fastq_entries.push_back(vector>( + config->threads)); // current and next output + } + // pre-allocate read seqs + for (int i = 0; i < 2; i++) { + read_seqs.push_back( + vector>(config->threads)); // current and next output + read_names.push_back( + vector>(config->threads)); // current and next output + read_seq_lengths.push_back( + vector>(config->threads)); // current and next output + read_seq_max_lengths.push_back( + vector>(config->threads)); // current and next output + for (int j = 0; j < config->threads; j++) { + for (int k = 0; k < batch_size / config->threads; k++) { + read_seqs[i][j].push_back((uint8_t *)malloc(sizeof(uint8_t) * (30001))); + read_names[i][j].push_back(""); + read_seq_lengths[i][j].push_back( + -1); // this to flag unused slots in a batch + read_seq_max_lengths[i][j].push_back(30000); + } } - // pre-allocate read seqs + } + lprint({"Loading first batch.."}); + if (mode == 0) { + load_batch_fastq(config->threads, batch_size, p); + } else { for (int i = 0; i < 2; i++) { - read_seqs.push_back(vector>(config->threads)) ; // current and next output - read_names.push_back(vector>(config->threads)) ; // current and next output - read_seq_lengths.push_back(vector>(config->threads)) ; // current and next output - read_seq_max_lengths.push_back(vector>(config->threads)) ; // current and next output - for (int j = 0; j < config->threads; j++) { - for (int k = 0; k < batch_size / config->threads; k++) { - read_seqs[i][j].push_back((uint8_t*) malloc(sizeof(uint8_t) * (30001))) ; - read_names[i][j].push_back("") ; - read_seq_lengths[i][j].push_back(-1) ; // this to flag unused slots in a batch - read_seq_max_lengths[i][j].push_back(30000) ; - } + for (int j = 0; j < config->threads; j++) { + for (int k = 0; k < batch_size / config->threads; k++) { + bam_entries[i][j].push_back(bam_init1()); } + } } - lprint({"Loading first batch.."}); - if (mode == 0) { - load_batch_fastq(config->threads, batch_size, p) ; - } else { - for (int i = 0; i < 2; i++) { - for (int j = 0; j < config->threads; j++) { - for (int k = 0; k < batch_size / config->threads; k++) { - bam_entries[i][j].push_back(bam_init1()) ; - } - } - } - load_batch_bam(config->threads, batch_size, p) ; - } - batches.push_back(vector(config->threads)) ; // previous and current output - // main loop - time_t f ; - time(&f) ; - int b = 0 ; - uint64_t u = 0 ; - - bool should_load = true ; - bool should_process = true ; - bool loaded_last_batch = false ; - bool should_update_current_batch = false ; - - uint64_t total_sfs = 0 ; - uint64_t total_sfs_output_batch = 0 ; - lprint({"Extracting SFS strings on", to_string(config->threads), "threads.."}); - - //#pragma omp parallel num_threads(config->threads + 2) - while (should_process) { - //#pragma omp single - { - if (!should_load) { - should_process = false ; - } - if (loaded_last_batch) { - should_load = false ; - } - } - //#pragma omp for - #pragma omp parallel for num_threads(config->threads + 2) schedule(static, 1) - for(int i = 0; i < config->threads + 2; i++) { - int t = omp_get_thread_num() ; - if (t == 0) { - // load next batch of entries - if (should_load) { - if (mode == 1) { - loaded_last_batch = !load_batch_bam(config->threads, batch_size, (p + 1) % 2) ; - } else { - loaded_last_batch = !load_batch_fastq(config->threads, batch_size, (p + 1) % 2) ; - } - //lprint({"Loaded."}); - batches.push_back(vector(config->threads)) ; // previous and current output - } - } else if (t == 1) { - if (b >= 1) { - // just count how many strings we have - uint64_t c = 0 ; - for (const 
auto &batch: batches[b - 1]) { - for (auto it = batch.begin(); it != batch.end(); it++) { - c += it->second.size() ; - } - } - total_sfs += c ; - total_sfs_output_batch += c ; - // cerr << "[I] Merged " << c << " new sequences. " << total_sfs << " total sequences." << endl ; - // reached memory limit or last pipeline run - if (total_sfs_output_batch >= 10000000 || !should_process) { - output_batch(b) ; - total_sfs_output_batch = 0 ; - current_batch += 1 ; - } - } - } else { - if (should_process) { - batches[b][t - 2] = process_batch(index, p, t - 2) ; - } - } - } - //#pragma omp single - { - if (!should_load) { - //lprint({"Processed last batch of inputs."}); - } - if (!should_process) { - //break ; - } - - p += 1 ; - p %= 2 ; - b += 1 ; - time_t s ; - time(&s) ; - if (s - f == 0) { - s = f + 1 ; + load_batch_bam(config->threads, batch_size, p); + } + batches.push_back( + vector(config->threads)); // previous and current output + // main loop + time_t f; + time(&f); + int b = 0; + uint64_t u = 0; + + bool should_load = true; + bool should_process = true; + bool loaded_last_batch = false; + bool should_update_current_batch = false; + + uint64_t total_sfs = 0; + uint64_t total_sfs_output_batch = 0; + lprint( + {"Extracting SFS strings on", to_string(config->threads), "threads.."}); + + //#pragma omp parallel num_threads(config->threads + 2) + while (should_process) { + //#pragma omp single + { + if (!should_load) { + should_process = false; + } + if (loaded_last_batch) { + should_load = false; + } + } +//#pragma omp for +#pragma omp parallel for num_threads(config->threads + 2) schedule(static, 1) + for (int i = 0; i < config->threads + 2; i++) { + int t = omp_get_thread_num(); + if (t == 0) { + // load next batch of entries + if (should_load) { + if (mode == 1) { + loaded_last_batch = + !load_batch_bam(config->threads, batch_size, (p + 1) % 2); + } else { + loaded_last_batch = + !load_batch_fastq(config->threads, batch_size, (p + 1) % 2); + } + // lprint({"Loaded."}); + batches.push_back(vector( + config->threads)); // previous and current output + } + } else if (t == 1) { + if (b >= 1) { + // just count how many strings we have + uint64_t c = 0; + for (const auto &batch : batches[b - 1]) { + for (auto it = batch.begin(); it != batch.end(); it++) { + c += it->second.size(); } - cerr << "[I] Processed batch " << b << ". Reads so far " << reads_processed << ". Reads per second: " << reads_processed / (s - f) << ". SFS extracted so far: " << total_sfs << ". Batches exported: " << current_batch << ". Time: " << s - f << "\r" ; - } + } + total_sfs += c; + total_sfs_output_batch += c; + // cerr << "[I] Merged " << c << " new sequences. " << total_sfs << " + // total sequences." << endl ; reached memory limit or last pipeline + // run + if (total_sfs_output_batch >= 10000000 || !should_process) { + output_batch(b); + total_sfs_output_batch = 0; + current_batch += 1; + } + } + } else { + if (should_process) { + batches[b][t - 2] = process_batch(index, p, t - 2); + } + } + } + //#pragma omp single + { + if (!should_load) { + // lprint({"Processed last batch of inputs."}); + } + if (!should_process) { + // break ; + } + + p += 1; + p %= 2; + b += 1; + time_t s; + time(&s); + if (s - f == 0) { + s = f + 1; + } + cerr << "[I] Processed batch " << b << ". Reads so far " + << reads_processed + << ". Reads per second: " << reads_processed / (s - f) + << ". SFS extracted so far: " << total_sfs + << ". Batches exported: " << current_batch << ". 
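// Generic sketch of the double-buffered pipeline driving PingPong::search()
// above: two buffers indexed by p and (p + 1) % 2, and one parallel region per
// round in which one task loads the next batch into the inactive buffer, one
// task flushes the results of the previous round, and the remaining tasks
// process the active buffer. `run_pipeline`, `load_fn`, `process_fn` and
// `flush_fn` are placeholders, the branch is on the loop index rather than
// omp_get_thread_num() as in the real loop (which relies on
// schedule(static, 1)), and the real loop additionally batches output until a
// memory threshold. Compile with -fopenmp; illustration only.
#include <functional>

void run_pipeline(int threads,
                  std::function<bool(int)> load_fn,       // fill buffer p; false when input is exhausted
                  std::function<void(int, int)> process_fn, // worker t consumes buffer p
                  std::function<void(int)> flush_fn) {      // merge/write results of a finished round
  int p = 0, round = 0;
  bool should_load = true, should_process = true;
  bool loaded_last = !load_fn(p); // prime the first buffer, as search() does
  while (should_process) {
    if (!should_load)
      should_process = false;
    if (loaded_last)
      should_load = false;
#pragma omp parallel for num_threads(threads + 2) schedule(static, 1)
    for (int i = 0; i < threads + 2; i++) {
      if (i == 0) { // loader: prefetch the next batch into the inactive buffer
        if (should_load)
          loaded_last = !load_fn((p + 1) % 2);
      } else if (i == 1) { // writer: results of round-1 are complete by now
        if (round >= 1)
          flush_fn(round - 1);
      } else { // workers 0..threads-1 consume the active buffer
        if (should_process)
          process_fn(p, i - 2);
      }
    }
    p = (p + 1) % 2; // swap buffers and move to the next round
    round += 1;
  }
}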
Time: " << s - f + << "\r"; } - cerr << endl ; - lprint({"Done."}) ; - cout << non_x_reads << " processed." << endl ; - // cleanup - kseq_destroy(fastq_iterator) ; - gzclose(fastq_file) ; - num_output_batches = current_batch ; - return u ; + } + cerr << endl; + lprint({"Done."}); + cout << non_x_reads << " processed." << endl; + // cleanup + kseq_destroy(fastq_iterator); + gzclose(fastq_file); + num_output_batches = current_batch; + return u; } void PingPong::load_smoothed_read_ids() { - lprint({"Loading smoothed read ids.."}) ; - ifstream ignore_file(config->workdir + "/ignored_reads.txt") ; - if (ignore_file.is_open()) { - string read_name; - while (getline(ignore_file, read_name)) { - ignored_reads[read_name] = true ; - } - ignore_file.close() ; + lprint({"Loading smoothed read ids.."}); + ifstream ignore_file(config->workdir + "/ignored_reads.txt"); + if (ignore_file.is_open()) { + string read_name; + while (getline(ignore_file, read_name)) { + ignored_reads[read_name] = true; } - ifstream in_file(config->workdir + "/smoothed_reads.txt") ; - if (in_file.is_open()) { - string read_name; - while (getline(in_file, read_name)) { - smoothed_reads[read_name] = true ; - } - in_file.close() ; + ignore_file.close(); + } + ifstream in_file(config->workdir + "/smoothed_reads.txt"); + if (in_file.is_open()) { + string read_name; + while (getline(in_file, read_name)) { + smoothed_reads[read_name] = true; } - lprint({"Loaded", to_string(ignored_reads.size()), "ignored read ids."}) ; - lprint({"Loaded", to_string(smoothed_reads.size()), "smoothed read ids."}) ; + in_file.close(); + } + lprint({"Loaded", to_string(ignored_reads.size()), "ignored read ids."}); + lprint({"Loaded", to_string(smoothed_reads.size()), "smoothed read ids."}); } // ============================================================================= \\ @@ -448,117 +493,122 @@ void PingPong::load_smoothed_read_ids() { /** Code adapted from ropebwt2 (main_ropebwt2 in main.c) **/ int PingPong::index() { - auto c = Configuration::getInstance() ; - // hardcoded parameters - uint64_t m = (uint64_t)(.97 * 10 * 1024 * 1024 * 1024) + 1 ; // batch size for multi-string indexing - int block_len = ROPE_DEF_BLOCK_LEN, max_nodes = ROPE_DEF_MAX_NODES, so = MR_SO_RCLO ; - int thr_min = 100 ; // switch to single thread when < 100 strings remain in a batch - - // the index - mrope_t *mr = 0 ; - - bool binary_output = c->binary ; - if (c->append != "") { - FILE *fp ; - lprint({"Appending index:", c->append}); - if ((fp = fopen(c->append.c_str(), "rb")) == 0) { - lprint({"Failed to open file", c->append}, 2); - return 1 ; - } - mr = mr_restore(fp) ; - fclose(fp) ; - } - - // Initialize mr if not restored - if (mr == 0) mr = mr_init(max_nodes, block_len, so) ; - mr_thr_min(mr, thr_min) ; - - // Parsing the input sample - gzFile fp; - if (c->fasta != "") - fp = gzopen(c->fasta.c_str(), "rb") ; - else if (c->fastq != "") - fp = gzopen(c->fastq.c_str(), "rb") ; - else { - cerr << "Please provide a FASTA/Q file. Halting.." << endl; - exit(1); - } - kseq_t *ks = kseq_init(fp) ; - kstring_t buf = { 0, 0, 0 } ; // buffer, will contain the concatenation - int l ; - uint8_t *s ; - int i ; - while ((l = kseq_read(ks)) >= 0) { - s = (uint8_t*)ks->seq.s ; - - // change encoding - for (i = 0; i < l; ++i) { - s[i] = s[i] < 128? 
seq_nt6_table[s[i]] : 5 ; - } - - // Reverse the sequence - for (i = 0; i < l>>1; ++i) { - int tmp = s[l-1-i] ; - s[l-1-i] = s[i] ; - s[i] = tmp ; - } + auto c = Configuration::getInstance(); + // hardcoded parameters + uint64_t m = (uint64_t)(.97 * 10 * 1024 * 1024 * 1024) + + 1; // batch size for multi-string indexing + int block_len = ROPE_DEF_BLOCK_LEN, max_nodes = ROPE_DEF_MAX_NODES, + so = MR_SO_RCLO; + int thr_min = + 100; // switch to single thread when < 100 strings remain in a batch + + // the index + mrope_t *mr = 0; + + bool binary_output = c->binary; + if (c->append != "") { + FILE *fp; + lprint({"Appending index:", c->append}); + if ((fp = fopen(c->append.c_str(), "rb")) == 0) { + lprint({"Failed to open file", c->append}, 2); + return 1; + } + mr = mr_restore(fp); + fclose(fp); + } + + // Initialize mr if not restored + if (mr == 0) + mr = mr_init(max_nodes, block_len, so); + mr_thr_min(mr, thr_min); + + // Parsing the input sample + gzFile fp; + if (c->fasta != "") + fp = gzopen(c->fasta.c_str(), "rb"); + else if (c->fastq != "") + fp = gzopen(c->fastq.c_str(), "rb"); + else { + cerr << "Please provide a FASTA/Q file. Halting.." << endl; + exit(1); + } + kseq_t *ks = kseq_init(fp); + kstring_t buf = {0, 0, 0}; // buffer, will contain the concatenation + int l; + uint8_t *s; + int i; + while ((l = kseq_read(ks)) >= 0) { + s = (uint8_t *)ks->seq.s; + + // change encoding + for (i = 0; i < l; ++i) { + s[i] = s[i] < 128 ? seq_nt6_table[s[i]] : 5; + } - // Add forward to buffer - kputsn((char*)ks->seq.s, ks->seq.l + 1, &buf) ; + // Reverse the sequence + for (i = 0; i < l >> 1; ++i) { + int tmp = s[l - 1 - i]; + s[l - 1 - i] = s[i]; + s[i] = tmp; + } - // Add reverse to buffer - for (i = 0; i < l>>1; ++i) { - int tmp = s[l - 1 - i] ; - tmp = (tmp >= 1 && tmp <= 4)? 5 - tmp : tmp ; - s[l - 1 - i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i] ; - s[i] = tmp ; - } - if (l&1) s[i] = (s[i] >= 1 && s[i] <= 4)? 5 - s[i] : s[i] ; - kputsn((char*)ks->seq.s, ks->seq.l + 1, &buf) ; + // Add forward to buffer + kputsn((char *)ks->seq.s, ks->seq.l + 1, &buf); - if(buf.l >= m) { - mr_insert_multi(mr, buf.l, (uint8_t*)buf.s, 1) ; - buf.l = 0 ; - } + // Add reverse to buffer + for (i = 0; i < l >> 1; ++i) { + int tmp = s[l - 1 - i]; + tmp = (tmp >= 1 && tmp <= 4) ? 5 - tmp : tmp; + s[l - 1 - i] = (s[i] >= 1 && s[i] <= 4) ? 5 - s[i] : s[i]; + s[i] = tmp; } + if (l & 1) + s[i] = (s[i] >= 1 && s[i] <= 4) ? 
5 - s[i] : s[i]; + kputsn((char *)ks->seq.s, ks->seq.l + 1, &buf); - if (buf.l) { // last batch - mr_insert_multi(mr, buf.l, (uint8_t*)buf.s, 1) ; + if (buf.l >= m) { + mr_insert_multi(mr, buf.l, (uint8_t *)buf.s, 1); + buf.l = 0; } - - free(buf.s) ; - kseq_destroy(ks) ; - gzclose(fp) ; - - // dump index to stdout - if (binary_output) { - // binary FMR format - mr_dump(mr, fopen(c->index.c_str(), "wb")) ; - } else { - // FMD format - mritr_t itr ; - const uint8_t *block ; - rld_t *e = 0 ; - rlditr_t di ; - e = rld_init(6, 3) ; - rld_itr_init(e, &di, 0) ; - mr_itr_first(mr, &itr, 1) ; - while ((block = mr_itr_next_block(&itr)) != 0) { - const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block) ; - while (q < end) { - int c = 0 ; - int64_t l ; - rle_dec1(q, c, l) ; - rld_enc(e, &di, l, c) ; - } - } - rld_enc_finish(e, &di) ; - rld_dump(e, c->index.c_str()) ; + } + + if (buf.l) { // last batch + mr_insert_multi(mr, buf.l, (uint8_t *)buf.s, 1); + } + + free(buf.s); + kseq_destroy(ks); + gzclose(fp); + + // dump index to stdout + if (binary_output) { + // binary FMR format + mr_dump(mr, fopen(c->index.c_str(), "wb")); + } else { + // FMD format + mritr_t itr; + const uint8_t *block; + rld_t *e = 0; + rlditr_t di; + e = rld_init(6, 3); + rld_itr_init(e, &di, 0); + mr_itr_first(mr, &itr, 1); + while ((block = mr_itr_next_block(&itr)) != 0) { + const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block); + while (q < end) { + int c = 0; + int64_t l; + rle_dec1(q, c, l); + rld_enc(e, &di, l, c); + } } + rld_enc_finish(e, &di); + rld_dump(e, c->index.c_str()); + } - mr_destroy(mr) ; + mr_destroy(mr); - return 0 ; + return 0; } // ============================================================================= \\ @@ -566,29 +616,31 @@ int PingPong::index() { // ============================================================================= \\ bool PingPong::query(string q) { - config = Configuration::getInstance() ; - rld_t *index = rld_restore(config->index.c_str()) ; - // parse arguments - lprint({"Restoring index.."}); - int l = q.length() ; - uint8_t *p = (uint8_t*) q.c_str() ; - seq_char2nt6(l, p); - bool found_full = backward_search(index, p, l - 1) ; - bool found_prefix = backward_search(index, p, l - 2) ; - bool found_suffix = backward_search(index, p + 1, l - 2) ; - auto is_sfs = !found_full && (found_prefix || found_suffix) ; - cout << "Exact match: " << (found_full ? "yes" : "no") << endl ; - cout << "Prefix match: " << (found_prefix ? "yes" : "no") << endl ; - cout << "Suffix match: " << (found_suffix ? "yes" : "no") << endl ; - if (!is_sfs) { - load_chromosomes(config->reference) ; - char* s = nullptr ; - for (auto chrom = chromosome_seqs.begin(); chrom != chromosome_seqs.end(); chrom++) { - s = strstr(chromosome_seqs[chrom->first], q.c_str()) ; - if (s != nullptr) { - cout << chrom->first << ": Found at " << s - chromosome_seqs[chrom->first] << endl ; - } - } + config = Configuration::getInstance(); + rld_t *index = rld_restore(config->index.c_str()); + // parse arguments + lprint({"Restoring index.."}); + int l = q.length(); + uint8_t *p = (uint8_t *)q.c_str(); + seq_char2nt6(l, p); + bool found_full = backward_search(index, p, l - 1); + bool found_prefix = backward_search(index, p, l - 2); + bool found_suffix = backward_search(index, p + 1, l - 2); + auto is_sfs = !found_full && (found_prefix || found_suffix); + cout << "Exact match: " << (found_full ? "yes" : "no") << endl; + cout << "Prefix match: " << (found_prefix ? 
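// Sketch of the in-place reverse-complement idiom used while building the
// index above: in the nt6 encoding (A,C,G,T -> 1..4, N -> 5, 0 as terminator)
// the complement of a base code b in 1..4 is simply 5 - b, so the second
// orientation of a read is produced by a two-pointer swap with
// complementation, handling the middle element of odd-length reads.
// `nt6_comp`/`nt6_revcomp` are illustrative names and a plain std::vector
// stands in for the kseq buffer; illustration only.
#include <cassert>
#include <vector>

static inline int nt6_comp(int b) { return (b >= 1 && b <= 4) ? 5 - b : b; }

static void nt6_revcomp(std::vector<int> &s) {
  int l = (int)s.size();
  for (int i = 0; i < l >> 1; ++i) {
    int tmp = nt6_comp(s[l - 1 - i]);
    s[l - 1 - i] = nt6_comp(s[i]);
    s[i] = tmp;
  }
  if (l & 1)
    s[l >> 1] = nt6_comp(s[l >> 1]); // odd length: complement the middle base
}

int main() {
  std::vector<int> s = {1, 2, 3, 3, 4}; // ACGGT
  nt6_revcomp(s);
  assert((s == std::vector<int>{1, 2, 2, 3, 4})); // ACCGT = revcomp(ACGGT)
  return 0;
}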
"yes" : "no") << endl; + cout << "Suffix match: " << (found_suffix ? "yes" : "no") << endl; + if (!is_sfs) { + load_chromosomes(config->reference); + char *s = nullptr; + for (auto chrom = chromosome_seqs.begin(); chrom != chromosome_seqs.end(); + chrom++) { + s = strstr(chromosome_seqs[chrom->first], q.c_str()); + if (s != nullptr) { + cout << chrom->first << ": Found at " + << s - chromosome_seqs[chrom->first] << endl; + } } - return is_sfs ; + } + return is_sfs; } diff --git a/ping_pong.hpp b/ping_pong.hpp index 1e3d84d..95a1f20 100644 --- a/ping_pong.hpp +++ b/ping_pong.hpp @@ -1,120 +1,118 @@ #ifndef PNG_HPP #define PNG_HPP -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include -#include #include +#include #include +#include +#include +#include #include +#include +#include -#include "rle.h" -#include "rld0.h" #include "mrope.h" +#include "rld0.h" +#include "rle.h" -#include -#include -#include -#include -#include #include "htslib/hfile.h" #include "htslib/hts_endian.h" +#include +#include +#include +#include +#include +#include "assembler.hpp" #include "bam.hpp" -#include "sfs.hpp" -#include "fastq.hpp" +#include "chromosomes.hpp" #include "config.hpp" +#include "fastq.hpp" #include "lprint.hpp" -#include "assembler.hpp" -#include "chromosomes.hpp" +#include "sfs.hpp" -using namespace std ; +using namespace std; #ifdef DEBUG_MODE -# define DEBUG(x) x -# define NEBUG(x) +#define DEBUG(x) x +#define NEBUG(x) #else -# define DEBUG(x) -# define NEBUG(x) x +#define DEBUG(x) +#define NEBUG(x) x #endif -#define fm6_comp(a) ((a) >= 1 && (a) <= 4? 5 - (a) : (a)) +#define fm6_comp(a) ((a) >= 1 && (a) <= 4 ? 5 - (a) : (a)) -#define fm6_set_intv(e, c, ik) ((ik).x[0] = (e)->cnt[(int)(c)], (ik).x[2] = (e)->cnt[(int)(c)+1] - (e)->cnt[(int)(c)], (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0) +#define fm6_set_intv(e, c, ik) \ + ((ik).x[0] = (e)->cnt[(int)(c)], \ + (ik).x[2] = (e)->cnt[(int)(c) + 1] - (e)->cnt[(int)(c)], \ + (ik).x[1] = (e)->cnt[fm6_comp(c)], (ik).info = 0) /** From ropebwt2 ********/ static unsigned char seq_nt6_table[128] = { - 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 -} ; + 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, + 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; -typedef SFS sfs_type_t ; -typedef std::map> batch_type_t; +typedef SFS sfs_type_t; +typedef std::map> batch_type_t; -static const std::vector int2char ({"$", "A", "C", "G", "T", "N"}) ; +static const std::vector int2char({"$", "A", "C", "G", "T", "N"}); class PingPong { public: + int index(); + int search(); + bool query(std::string); - int index() ; - int search() ; - bool query(std::string) ; - - int num_output_batches ; + int num_output_batches; private: - - Configuration* config ; - - int mode ; - int current_batch = 0 ; - int last_dumped_batch = 0 ; - int reads_processed = 0 ; - int non_x_reads = 
0 ; - - gzFile fastq_file ; - kseq_t* fastq_iterator ; - samFile *bam_file ; - bam_hdr_t *bam_header ; - - unordered_map smoothed_reads ; - unordered_map ignored_reads ; - void load_smoothed_read_ids() ; - - std::vector>> read_seq_lengths ; - std::vector>> read_seq_max_lengths ; - std::vector>> read_seqs ; - std::vector>> read_names ; - std::vector>> bam_entries ; - std::vector>> fastq_entries ; - bool load_batch_bam(int threads, int batch_size, int p) ; - bool load_batch_fastq(int threads, int batch_size, int p) ; - batch_type_t process_batch(rld_t* index, int p, int i) ; - void ping_pong_search(rld_t *index, uint8_t* seq, int l, std::vector& solutions, bool is_smoothed, bam1_t*) ; - void output_batch(int) ; - - std::vector> batches ; - - bool check_solution(rld_t* index, std::string S) ; - bool backward_search(rld_t *index, const uint8_t *P, int p2) ; - fastq_entry_t get_solution(fastq_entry_t fqe, int s, int l) ; - -} ; + Configuration *config; + + int mode; + int current_batch = 0; + int last_dumped_batch = 0; + int reads_processed = 0; + int non_x_reads = 0; + + gzFile fastq_file; + kseq_t *fastq_iterator; + samFile *bam_file; + bam_hdr_t *bam_header; + + unordered_map smoothed_reads; + unordered_map ignored_reads; + void load_smoothed_read_ids(); + + std::vector>> read_seq_lengths; + std::vector>> read_seq_max_lengths; + std::vector>> read_seqs; + std::vector>> read_names; + std::vector>> bam_entries; + std::vector>> fastq_entries; + bool load_batch_bam(int threads, int batch_size, int p); + bool load_batch_fastq(int threads, int batch_size, int p); + batch_type_t process_batch(rld_t *index, int p, int i); + void ping_pong_search(rld_t *index, uint8_t *seq, int l, + std::vector &solutions, bool is_smoothed, + bam1_t *); + void output_batch(int); + + std::vector> batches; + + bool check_solution(rld_t *index, std::string S); + bool backward_search(rld_t *index, const uint8_t *P, int p2); + fastq_entry_t get_solution(fastq_entry_t fqe, int s, int l); +}; #endif diff --git a/sfs.cpp b/sfs.cpp index 1e7e6a0..4dc764e 100644 --- a/sfs.cpp +++ b/sfs.cpp @@ -1,32 +1,33 @@ #include "sfs.hpp" -using namespace std ; +using namespace std; bool operator<(const SFS &x, const SFS &y) { return x.s < y.s; } map> parse_sfsfile(const string &sfs_path, int tau) { - map> SFSs; - string line; - ifstream inf(sfs_path); - if (inf.is_open()) { - string info[5]; - string read_name; - while (getline(inf, line)) { - stringstream ssin(line); - int i = 0; - while (ssin.good() && i < 5) { - ssin >> info[i++]; - } - if (stoi(info[3]) < tau) { - continue; - } - if (info[0].compare("*") != 0) { - read_name = info[0]; - SFSs[read_name] = vector(); - } - //TODO - SFSs[read_name].push_back(SFS(stoi(info[1]), stoi(info[2]), stoi(info[3]), true)) ;//info[4].compare("1") == 0)); - } + map> SFSs; + string line; + ifstream inf(sfs_path); + if (inf.is_open()) { + string info[5]; + string read_name; + while (getline(inf, line)) { + stringstream ssin(line); + int i = 0; + while (ssin.good() && i < 5) { + ssin >> info[i++]; + } + if (stoi(info[3]) < tau) { + continue; + } + if (info[0].compare("*") != 0) { + read_name = info[0]; + SFSs[read_name] = vector(); + } + // TODO + SFSs[read_name].push_back(SFS(stoi(info[1]), stoi(info[2]), stoi(info[3]), + true)); // info[4].compare("1") == 0)); } - return SFSs; + } + return SFSs; } diff --git a/sfs.hpp b/sfs.hpp index a375361..bc1e744 100644 --- a/sfs.hpp +++ b/sfs.hpp @@ -1,12 +1,12 @@ #ifndef SFS_HPP #define SFS_HPP +#include +#include #include +#include #include #include -#include 
-#include -#include static const char RCN[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 @@ -25,71 +25,72 @@ static const char RCN[128] = { }; struct SFS { - uint s ; - uint l ; - uint c ; - bool isreversed; + uint s; + uint l; + uint c; + bool isreversed; - SFS() { - s = 0 ; - l = 0 ; - c = 0 ; - isreversed = false; - } + SFS() { + s = 0; + l = 0; + c = 0; + isreversed = false; + } - SFS(uint s_, uint l_, uint c_, bool isreversed_) { - s = s_ ; - l = l_ ; - c = c_ ; - isreversed = isreversed_; - } + SFS(uint s_, uint l_, uint c_, bool isreversed_) { + s = s_; + l = l_; + c = c_; + isreversed = isreversed_; + } - void reverse(uint p) { s = p - s - l; } + void reverse(uint p) { s = p - s - l; } }; bool operator<(const SFS &, const SFS &); struct ExtSFS { - std::string chrom ; - std::string qname ; - int s ; - int e ; + std::string chrom; + std::string qname; + int s; + int e; - ExtSFS(const std::string& _chrom, const std::string& _qname, int _s, int _e) { - chrom = _chrom ; - qname = _qname ; - s = _s ; - e = _e ; - } + ExtSFS(const std::string &_chrom, const std::string &_qname, int _s, int _e) { + chrom = _chrom; + qname = _qname; + s = _s; + e = _e; + } - bool operator<(const ExtSFS& c) const { - if (chrom == c.chrom) { - return s < c.s ; - } else { - return chrom < c.chrom ; - } + bool operator<(const ExtSFS &c) const { + if (chrom == c.chrom) { + return s < c.s; + } else { + return chrom < c.chrom; } + } - bool operator==(const ExtSFS& c) const { - return chrom == c.chrom and s == c.s and e == c.e ; - } + bool operator==(const ExtSFS &c) const { + return chrom == c.chrom and s == c.s and e == c.e; + } }; class Consensus { public: - std::string seq ; - std::string chrom ; - std::string cigar ; - int s ; - int e ; + std::string seq; + std::string chrom; + std::string cigar; + int s; + int e; - Consensus(const std::string _seq, const std::string _cigar, const std::string _chrom, int _s, int _e) { - seq = _seq ; - cigar = _cigar ; - chrom = _chrom ; - s = _s ; - e = _e ; - } + Consensus(const std::string _seq, const std::string _cigar, + const std::string _chrom, int _s, int _e) { + seq = _seq; + cigar = _cigar; + chrom = _chrom; + s = _s; + e = _e; + } }; std::map> parse_sfsfile(const std::string &, int); diff --git a/smoother.cpp b/smoother.cpp index e5c5351..a39a7b0 100644 --- a/smoother.cpp +++ b/smoother.cpp @@ -1,464 +1,508 @@ #include "smoother.hpp" -using namespace std ; +using namespace std; -//typedef struct bam1_t { -// bam1_core_t core; // won't change -// uint64_t id; // won't change -// uint8_t *data; // has to reallocate -// int l_data; // will change -// uint32_t m_data; // won't change -// uint32_t mempolicy:2, :30 /* Reserved */; -//} bam1_t; +// typedef struct bam1_t { +// bam1_core_t core; // won't change +// uint64_t id; // won't change +// uint8_t *data; // has to reallocate +// int l_data; // will change +// uint32_t m_data; // won't change +// uint32_t mempolicy:2, :30 /* Reserved */; +// } bam1_t; -//typedef struct bam1_core_t { -// hts_pos_t pos; // won't change -// int32_t tid; // won't change -// uint16_t bin; // just copy -// uint8_t qual; // won't change -// uint8_t l_extranul; // won't change -// uint16_t flag; // won't change -// uint16_t l_qname; // won't change -// uint32_t n_cigar; // will change -// int32_t l_qseq; // will change -// int32_t mtid; // just copy -// hts_pos_t mpos; // just copy -// hts_pos_t isize; // may or may ont change? 
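// Small sketch of the coordinate flip in SFS::reverse(p) above: an interval
// that starts at s and spans l bases of a read of length p starts at
// p - s - l once the read is reverse-complemented (the interval's last base
// becomes its first). `flip_start` is an illustrative name; illustration
// only.
#include <cassert>

static unsigned flip_start(unsigned p, unsigned s, unsigned l) {
  return p - s - l;
}

int main() {
  // read of length 10, interval covering positions 2..4 (s = 2, l = 3):
  // on the reverse strand it covers positions 5..7, so the flipped start is 5
  assert(flip_start(10, 2, 3) == 5);
  // flipping twice returns the original start
  assert(flip_start(10, flip_start(10, 2, 3), 3) == 2);
  return 0;
}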
-//} bam1_core_t; +// typedef struct bam1_core_t { +// hts_pos_t pos; // won't change +// int32_t tid; // won't change +// uint16_t bin; // just copy +// uint8_t qual; // won't change +// uint8_t l_extranul; // won't change +// uint16_t flag; // won't change +// uint16_t l_qname; // won't change +// uint32_t n_cigar; // will change +// int32_t l_qseq; // will change +// int32_t mtid; // just copy +// hts_pos_t mpos; // just copy +// hts_pos_t isize; // may or may ont change? +// } bam1_core_t; int do_realloc_bam_data(bam1_t *b, size_t desired) { - //cout << "Extending BAM entry." << endl ; - uint32_t new_m_data ; - uint8_t *new_data ; - new_m_data = desired ; - kroundup32(new_m_data) ; - if (new_m_data < desired) { - errno = ENOMEM ; // Not strictly true but we can't store the size - return -1 ; - } - new_data = (uint8_t*) realloc(b->data, new_m_data) ; - if (!new_data) return -1 ; - b->data = new_data ; - b->m_data = new_m_data ; - return 0 ; + // cout << "Extending BAM entry." << endl ; + uint32_t new_m_data; + uint8_t *new_data; + new_m_data = desired; + kroundup32(new_m_data); + if (new_m_data < desired) { + errno = ENOMEM; // Not strictly true but we can't store the size + return -1; + } + new_data = (uint8_t *)realloc(b->data, new_m_data); + if (!new_data) + return -1; + b->data = new_data; + b->m_data = new_m_data; + return 0; } int realloc_bam_data(bam1_t *b, size_t desired) { - if (desired <= b->m_data) return 0 ; - return do_realloc_bam_data(b, desired) ; + if (desired <= b->m_data) + return 0; + return do_realloc_bam_data(b, desired); } -void rebuild_bam_entry(bam1_t* alignment, char* seq, uint8_t* qual, vector> cigar) { - auto l_aux = bam_get_l_aux(alignment) ; - uint8_t* aux = (uint8_t*) malloc(sizeof(uint8_t) * l_aux) ; - memcpy(aux, alignment->data + alignment->l_data - l_aux, l_aux) ; - // update core - alignment->core.n_cigar = cigar.size() ; - alignment->core.l_qseq = strlen(seq) ; - int l = strlen(seq) ; - // rebuild data - int l_data = alignment->core.l_qname + (4 * alignment->core.n_cigar) + ((l + 1) >> 1) + l + l_aux ; - realloc_bam_data(alignment, l_data) ; - alignment->l_data = l_data ; - // copy qname - int offset = alignment->core.l_qname ; - // copy cigar - uint8_t* cigar_encoded = encode_cigar(cigar) ; - memcpy(alignment->data + offset, cigar_encoded, 4 * alignment->core.n_cigar) ; - offset += 4 * alignment->core.n_cigar ; - free(cigar_encoded) ; - // copy seq data - have to convert seq - uint8_t* seq_bytes = encode_bam_seq(seq) ; - memcpy(alignment->data + offset, seq_bytes, (l + 1) >> 1) ; - free(seq_bytes) ; - offset += ((l + 1) >> 1) ; - // copy quality - memcpy(alignment->data + offset, qual, l) ; - offset += l ; - // copy aux - memcpy(alignment->data + offset, aux, l_aux) ; - free(aux) ; +void rebuild_bam_entry(bam1_t *alignment, char *seq, uint8_t *qual, + vector> cigar) { + auto l_aux = bam_get_l_aux(alignment); + uint8_t *aux = (uint8_t *)malloc(sizeof(uint8_t) * l_aux); + memcpy(aux, alignment->data + alignment->l_data - l_aux, l_aux); + // update core + alignment->core.n_cigar = cigar.size(); + alignment->core.l_qseq = strlen(seq); + int l = strlen(seq); + // rebuild data + int l_data = alignment->core.l_qname + (4 * alignment->core.n_cigar) + + ((l + 1) >> 1) + l + l_aux; + realloc_bam_data(alignment, l_data); + alignment->l_data = l_data; + // copy qname + int offset = alignment->core.l_qname; + // copy cigar + uint8_t *cigar_encoded = encode_cigar(cigar); + memcpy(alignment->data + offset, cigar_encoded, 4 * alignment->core.n_cigar); + offset += 4 
* alignment->core.n_cigar; + free(cigar_encoded); + // copy seq data - have to convert seq + uint8_t *seq_bytes = encode_bam_seq(seq); + memcpy(alignment->data + offset, seq_bytes, (l + 1) >> 1); + free(seq_bytes); + offset += ((l + 1) >> 1); + // copy quality + memcpy(alignment->data + offset, qual, l); + offset += l; + // copy aux + memcpy(alignment->data + offset, aux, l_aux); + free(aux); } -void Smoother::smooth_read(bam1_t* alignment, char* read_seq, string chrom, int _i, int _j, int _k) { - auto cigar_offsets = decode_cigar(alignment) ; - int l = 0 ; - // try and filter unintenresting reads early on - bool should_ignore = true ; - for (auto op: cigar_offsets) { - l += op.first ; - } - if (read_seq_max_lengths[_i][_j][_k] < l) { - free(read_seqs[_i][_j][_k]) ; - free(new_read_seqs[_i][_j][_k]) ; - free(new_read_quals[_i][_j][_k]) ; - // - read_seqs[_i][_j][_k] = (char*) malloc(sizeof(char) * (l + 1)) ; - new_read_seqs[_i][_j][_k] = (char*) malloc(sizeof(char) * (l + 1)) ; - new_read_quals[_i][_j][_k] = (uint8_t*) malloc(sizeof(char) * (l + 1)) ; - read_seq_max_lengths[_i][_j][_k] = l ; - } +void Smoother::smooth_read(bam1_t *alignment, char *read_seq, string chrom, + int _i, int _j, int _k) { + auto cigar_offsets = decode_cigar(alignment); + int l = 0; + // try and filter unintenresting reads early on + bool should_ignore = true; + for (auto op : cigar_offsets) { + l += op.first; + } + if (read_seq_max_lengths[_i][_j][_k] < l) { + free(read_seqs[_i][_j][_k]); + free(new_read_seqs[_i][_j][_k]); + free(new_read_quals[_i][_j][_k]); // - int n = 0 ; - int m = 0 ; - int ref_offset = alignment->core.pos ; - int ins_offset = 0 ; - int del_offset = 0 ; - int match_offset = 0 ; - int soft_clip_offset = 0 ; - char* new_seq = new_read_seqs[_i][_j][_k] ; - uint8_t* qual = bam_get_qual(alignment) ; - uint8_t* new_qual = new_read_quals[_i][_j][_k] ; - int pos = alignment->core.pos + 1 ; // this is 0-based, variant cpoordinates are 1-based - // Modify current bam1_t* struct - auto& core = alignment->core ; - vector> new_cigar ; - int m_diff = 0 ; - double num_match = 0 ; - double num_mismatch = 0 ; - char* ref_seq = chromosome_seqs[chrom] ; - while (true) { - if (m == cigar_offsets.size()) { - break ; - } - if (cigar_offsets[m].second == BAM_CMATCH || cigar_offsets[m].second == BAM_CEQUAL || cigar_offsets[m].second == BAM_CDIFF) { - memcpy(new_seq + n, ref_seq + ref_offset, cigar_offsets[m].first) ; - memcpy(new_qual + n, qual + match_offset + ins_offset + soft_clip_offset, cigar_offsets[m].first) ; - n += cigar_offsets[m].first ; - for (int j = 0; j < cigar_offsets[m].first; j++) { - num_mismatch += 1 ? 
ref_seq[ref_offset + j] != read_seq[match_offset + ins_offset + soft_clip_offset + j] : 0 ; - } - num_match += cigar_offsets[m].first ; - ref_offset += cigar_offsets[m].first ; - match_offset += cigar_offsets[m].first ; - if (new_cigar.size() >= 1 && new_cigar[new_cigar.size() - 1].second == BAM_CMATCH) { - new_cigar[new_cigar.size() - 1].first += cigar_offsets[m].first + m_diff ; - } else { - new_cigar.push_back(make_pair(cigar_offsets[m].first + m_diff, BAM_CMATCH)) ; - } - m_diff = 0 ; - } else if (cigar_offsets[m].second == BAM_CINS) { - if (cigar_offsets[m].first <= config->min_indel_length) { - // if a short INDEL then just don't add it to read - } else { - // for long INS, this is probably a SV so add it to the read - should_ignore = false ; - memcpy(new_seq + n, read_seq + soft_clip_offset + match_offset + ins_offset, cigar_offsets[m].first) ; - memcpy(new_qual + n, qual + soft_clip_offset + match_offset + ins_offset, cigar_offsets[m].first) ; - n += cigar_offsets[m].first ; - new_cigar.push_back(cigar_offsets[m]) ; - } - ins_offset += cigar_offsets[m].first ; - } else if (cigar_offsets[m].second == BAM_CDEL) { - if (cigar_offsets[m].first <= config->min_indel_length) { - // if a short DEL so let's just fix it - memcpy(new_seq + n, ref_seq + ref_offset, cigar_offsets[m].first) ; - memcpy(new_qual + n, qual + soft_clip_offset + match_offset + ins_offset, cigar_offsets[m].first) ; - n += cigar_offsets[m].first ; - m_diff += cigar_offsets[m].first ; - } else { - // for long DEL, this is probably a SV so let it be what it was - should_ignore = false ; - new_cigar.push_back(cigar_offsets[m]) ; - } - del_offset += cigar_offsets[m].first ; - ref_offset += cigar_offsets[m].first ; - } else if (cigar_offsets[m].second == BAM_CSOFT_CLIP) { - should_ignore = false ; - memcpy(new_seq + n, read_seq + soft_clip_offset + match_offset + ins_offset, cigar_offsets[m].first) ; - memcpy(new_qual + n, qual + soft_clip_offset + match_offset + ins_offset, cigar_offsets[m].first) ; - n += cigar_offsets[m].first ; - soft_clip_offset += cigar_offsets[m].first ; - new_cigar.push_back(cigar_offsets[m]) ; - } else { - cout << "Illegal Cigar OP" << endl ; - break ; - //if (cigar_offsets[m].second == BAM_CPAD || cigar_offsets[m].second == BAM_CHARD_CLIP || cigar_offsets[m].second == BAM_CBACK) { - } - m += 1 ; + read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1)); + new_read_seqs[_i][_j][_k] = (char *)malloc(sizeof(char) * (l + 1)); + new_read_quals[_i][_j][_k] = (uint8_t *)malloc(sizeof(char) * (l + 1)); + read_seq_max_lengths[_i][_j][_k] = l; + } + // + int n = 0; + int m = 0; + int ref_offset = alignment->core.pos; + int ins_offset = 0; + int del_offset = 0; + int match_offset = 0; + int soft_clip_offset = 0; + char *new_seq = new_read_seqs[_i][_j][_k]; + uint8_t *qual = bam_get_qual(alignment); + uint8_t *new_qual = new_read_quals[_i][_j][_k]; + int pos = alignment->core.pos + + 1; // this is 0-based, variant cpoordinates are 1-based + // Modify current bam1_t* struct + auto &core = alignment->core; + vector> new_cigar; + int m_diff = 0; + double num_match = 0; + double num_mismatch = 0; + char *ref_seq = chromosome_seqs[chrom]; + while (true) { + if (m == cigar_offsets.size()) { + break; } - new_seq[n] = '\0' ; - new_qual[n] = '\0' ; - //int r = 0 ; - //for (auto op: new_cigar) { - // if (op.second != BAM_CDEL) { - // n -= op.first ; - // } - // r += op.first ; - //} - //assert(n == 0) ; - char *qname = bam_get_qname(alignment) ; - //cout << n << " " << strlen(new_seq) << " " << r << endl ; - 
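// Simplified model (hypothetical helper, ignoring sequence/quality copying and
// the should_ignore bookkeeping) of the CIGAR rewrite in Smoother::smooth_read:
// indels no longer than max_indel are smoothed away, and a short deletion
// extends the surrounding match run by its length, which is what the m_diff
// counter tracks. For example, 5M 2D 4M 1I 3M becomes 14M with max_indel = 2.
#include <utility>
#include <vector>

std::vector<std::pair<int, char>>
smooth_cigar(const std::vector<std::pair<int, char>> &ops, int max_indel) {
  std::vector<std::pair<int, char>> out;
  int carry = 0; // plays the role of m_diff
  for (const auto &op : ops) {
    int len = op.first;
    char type = op.second;
    if (type == 'M') {
      if (!out.empty() && out.back().second == 'M')
        out.back().first += len + carry; // merge with the previous match run
      else
        out.push_back({len + carry, 'M'});
      carry = 0;
    } else if (type == 'I' && len <= max_indel) {
      // short insertion: dropped from the corrected read
    } else if (type == 'D' && len <= max_indel) {
      carry += len; // short deletion: filled in from the reference
    } else {
      out.push_back({len, type}); // long indels and soft clips kept as-is
    }
  }
  return out;
}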
//cout << bam_get_qname(alignment) << endl ; - // only do this on first processing thread - if (omp_get_thread_num() == 2) { - global_num_bases += num_match ; - global_num_mismatch += num_mismatch ; + if (cigar_offsets[m].second == BAM_CMATCH || + cigar_offsets[m].second == BAM_CEQUAL || + cigar_offsets[m].second == BAM_CDIFF) { + memcpy(new_seq + n, ref_seq + ref_offset, cigar_offsets[m].first); + memcpy(new_qual + n, qual + match_offset + ins_offset + soft_clip_offset, + cigar_offsets[m].first); + n += cigar_offsets[m].first; + for (int j = 0; j < cigar_offsets[m].first; j++) { + num_mismatch += + 1 ? ref_seq[ref_offset + j] != + read_seq[match_offset + ins_offset + soft_clip_offset + j] + : 0; + } + num_match += cigar_offsets[m].first; + ref_offset += cigar_offsets[m].first; + match_offset += cigar_offsets[m].first; + if (new_cigar.size() >= 1 && + new_cigar[new_cigar.size() - 1].second == BAM_CMATCH) { + new_cigar[new_cigar.size() - 1].first += + cigar_offsets[m].first + m_diff; + } else { + new_cigar.push_back( + make_pair(cigar_offsets[m].first + m_diff, BAM_CMATCH)); + } + m_diff = 0; + } else if (cigar_offsets[m].second == BAM_CINS) { + if (cigar_offsets[m].first <= config->min_indel_length) { + // if a short INDEL then just don't add it to read + } else { + // for long INS, this is probably a SV so add it to the read + should_ignore = false; + memcpy(new_seq + n, + read_seq + soft_clip_offset + match_offset + ins_offset, + cigar_offsets[m].first); + memcpy(new_qual + n, + qual + soft_clip_offset + match_offset + ins_offset, + cigar_offsets[m].first); + n += cigar_offsets[m].first; + new_cigar.push_back(cigar_offsets[m]); + } + ins_offset += cigar_offsets[m].first; + } else if (cigar_offsets[m].second == BAM_CDEL) { + if (cigar_offsets[m].first <= config->min_indel_length) { + // if a short DEL so let's just fix it + memcpy(new_seq + n, ref_seq + ref_offset, cigar_offsets[m].first); + memcpy(new_qual + n, + qual + soft_clip_offset + match_offset + ins_offset, + cigar_offsets[m].first); + n += cigar_offsets[m].first; + m_diff += cigar_offsets[m].first; + } else { + // for long DEL, this is probably a SV so let it be what it was + should_ignore = false; + new_cigar.push_back(cigar_offsets[m]); + } + del_offset += cigar_offsets[m].first; + ref_offset += cigar_offsets[m].first; + } else if (cigar_offsets[m].second == BAM_CSOFT_CLIP) { + should_ignore = false; + memcpy(new_seq + n, + read_seq + soft_clip_offset + match_offset + ins_offset, + cigar_offsets[m].first); + memcpy(new_qual + n, qual + soft_clip_offset + match_offset + ins_offset, + cigar_offsets[m].first); + n += cigar_offsets[m].first; + soft_clip_offset += cigar_offsets[m].first; + new_cigar.push_back(cigar_offsets[m]); + } else { + cout << "Illegal Cigar OP" << endl; + break; + // if (cigar_offsets[m].second == BAM_CPAD || cigar_offsets[m].second == + // BAM_CHARD_CLIP || cigar_offsets[m].second == BAM_CBACK) { } - // how many errors and SNPs do we expect? 
1/1000 each, so say if we see more than twice that then don't correct - if (config->selective) { - if (num_mismatch / num_match > 3 * expected_mismatch_rate) { - if (omp_get_thread_num() == 3) { - num_ignored_reads += 1 ; - } - return ; - } - // if we have so many deletions and insertions, then abort - if (ins_offset + del_offset > 0.7 * strlen(read_seq)) { - return ; - } + m += 1; + } + new_seq[n] = '\0'; + new_qual[n] = '\0'; + // int r = 0 ; + // for (auto op: new_cigar) { + // if (op.second != BAM_CDEL) { + // n -= op.first ; + // } + // r += op.first ; + // } + // assert(n == 0) ; + char *qname = bam_get_qname(alignment); + // cout << n << " " << strlen(new_seq) << " " << r << endl ; + // cout << bam_get_qname(alignment) << endl ; + // only do this on first processing thread + if (omp_get_thread_num() == 2) { + global_num_bases += num_match; + global_num_mismatch += num_mismatch; + } + // how many errors and SNPs do we expect? 1/1000 each, so say if we see more + // than twice that then don't correct + if (config->selective) { + if (num_mismatch / num_match > 3 * expected_mismatch_rate) { + if (omp_get_thread_num() == 3) { + num_ignored_reads += 1; + } + return; } - if (should_ignore) { - // check mismatch rate - ignored_reads[omp_get_thread_num() - 2].push_back(qname) ; - return ; + // if we have so many deletions and insertions, then abort + if (ins_offset + del_offset > 0.7 * strlen(read_seq)) { + return; } - smoothed_reads[omp_get_thread_num() - 2].push_back(qname) ; - rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar) ; + } + if (should_ignore) { + // check mismatch rate + ignored_reads[omp_get_thread_num() - 2].push_back(qname); + return; + } + smoothed_reads[omp_get_thread_num() - 2].push_back(qname); + rebuild_bam_entry(alignment, new_seq, new_qual, new_cigar); } -void Smoother::process_batch(vector bam_entries, int p, int i) { - bam1_t* alignment ; - for (int b = 0; b < bam_entries.size(); b++) { - alignment = bam_entries[b] ; - if (alignment == nullptr) { - break ; - } - if (alignment->core.flag & BAM_FUNMAP || alignment->core.flag & BAM_FSUPPLEMENTARY || alignment->core.flag & BAM_FSECONDARY) { - continue ; - } - if (alignment->core.l_qseq < 2) { - //cerr << "Read too short, ignoring.." << endl ; - continue ; - } - if (alignment->core.tid < 0) { - continue ; - } - string chrom(bam_header->target_name[alignment->core.tid]) ; - if (chromosome_seqs.find(chrom) == chromosome_seqs.end()) { - continue ; - } - smooth_read(alignment, read_seqs[p][i][b], chrom, p, i, b) ; +void Smoother::process_batch(vector bam_entries, int p, int i) { + bam1_t *alignment; + for (int b = 0; b < bam_entries.size(); b++) { + alignment = bam_entries[b]; + if (alignment == nullptr) { + break; + } + if (alignment->core.flag & BAM_FUNMAP || + alignment->core.flag & BAM_FSUPPLEMENTARY || + alignment->core.flag & BAM_FSECONDARY) { + continue; + } + if (alignment->core.l_qseq < 2) { + // cerr << "Read too short, ignoring.." 
<< endl ; + continue; } + if (alignment->core.tid < 0) { + continue; + } + string chrom(bam_header->target_name[alignment->core.tid]); + if (chromosome_seqs.find(chrom) == chromosome_seqs.end()) { + continue; + } + smooth_read(alignment, read_seqs[p][i][b], chrom, p, i, b); + } } // BAM writing based on https://www.biostars.org/p/181580/ void Smoother::run() { - config = Configuration::getInstance() ; - load_chromosomes(config->reference) ; - // parse arguments - bam_file = hts_open(config->bam.c_str(), "r") ; - bam_index = sam_index_load(bam_file, config->bam.c_str()) ; - bam_header = sam_hdr_read(bam_file) ; //read header - bgzf_mt(bam_file->fp.bgzf, 8, 1) ; - auto out_bam_path = config->workdir + (config->selective ? "/smoothed.selective.bam" : "/smoothed.bam") ; - out_bam_file = hts_open(out_bam_path.c_str(), "wb") ; - bgzf_mt(out_bam_file->fp.bgzf, 8, 1) ; - int r = sam_hdr_write(out_bam_file, bam_header) ; - if (r < 0) { - lprint({"Can't write corrected BAM header, aborting.."}, 2); - return ; + config = Configuration::getInstance(); + load_chromosomes(config->reference); + // parse arguments + bam_file = hts_open(config->bam.c_str(), "r"); + bam_index = sam_index_load(bam_file, config->bam.c_str()); + bam_header = sam_hdr_read(bam_file); // read header + bgzf_mt(bam_file->fp.bgzf, 8, 1); + auto out_bam_path = + config->workdir + + (config->selective ? "/smoothed.selective.bam" : "/smoothed.bam"); + out_bam_file = hts_open(out_bam_path.c_str(), "wb"); + bgzf_mt(out_bam_file->fp.bgzf, 8, 1); + int r = sam_hdr_write(out_bam_file, bam_header); + if (r < 0) { + lprint({"Can't write corrected BAM header, aborting.."}, 2); + return; + } + // allocate stuff + int modulo = 3; + int batch_size = (10000 / config->threads) * config->threads; + for (int i = 0; i < modulo; i++) { + read_seqs.push_back( + vector>(config->threads)); // current and next output + new_read_seqs.push_back( + vector>(config->threads)); // current and next output + new_read_quals.push_back( + vector>(config->threads)); // current and next output + read_seq_lengths.push_back( + vector>(config->threads)); // current and next output + read_seq_max_lengths.push_back( + vector>(config->threads)); // current and next output + for (int j = 0; j < config->threads; j++) { + for (int k = 0; k < batch_size / config->threads; k++) { + read_seqs[i][j].push_back((char *)malloc(sizeof(char) * (30001))); + new_read_seqs[i][j].push_back((char *)malloc(sizeof(char) * (30001))); + new_read_quals[i][j].push_back( + (uint8_t *)malloc(sizeof(uint8_t) * (30001))); + // + read_seq_lengths[i][j].push_back(30000); + read_seq_max_lengths[i][j].push_back(30000); + } } - // allocate stuff - int modulo = 3 ; - int batch_size = (10000 / config->threads) * config->threads ; - for (int i = 0; i < modulo; i++) { - read_seqs.push_back(vector>(config->threads)) ; // current and next output - new_read_seqs.push_back(vector>(config->threads)) ; // current and next output - new_read_quals.push_back(vector>(config->threads)) ; // current and next output - read_seq_lengths.push_back(vector>(config->threads)) ; // current and next output - read_seq_max_lengths.push_back(vector>(config->threads)) ; // current and next output - for (int j = 0; j < config->threads; j++) { - for (int k = 0; k < batch_size / config->threads; k++) { - read_seqs[i][j].push_back((char*) malloc(sizeof(char) * (30001))) ; - new_read_seqs[i][j].push_back((char*) malloc(sizeof(char) * (30001))) ; - new_read_quals[i][j].push_back((uint8_t*) malloc(sizeof(uint8_t) * (30001))) ; - // - 
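// Sketch of the three-slot pipeline that Smoother::run sets up with modulo = 3:
// on each iteration one thread loads the next batch into slot (p + 1) % 3, one
// thread flushes slot (p + 2) % 3 (the batch processed on the previous
// iteration, skipped on the very first pass), and the remaining threads process
// slot p before p advances. A toy illustration of the rotation only:
#include <cstdio>

int main() {
  const int modulo = 3;
  int p = 1; // the first batch is preloaded into slot 1 before the loop
  for (int b = 0; b < 5; b++) {
    std::printf("iteration %d: process slot %d, load slot %d, write slot %d\n",
                b, p, (p + 1) % modulo, (p + 2) % modulo);
    p = (p + 1) % modulo;
  }
  return 0;
}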
read_seq_lengths[i][j].push_back(30000) ; - read_seq_max_lengths[i][j].push_back(30000) ; - } - } + } + for (int i = 0; i < modulo; i++) { + bam_entries.push_back(vector>(config->threads)); + for (int j = 0; j < config->threads; j++) { + for (int k = 0; k < batch_size / config->threads; k++) { + bam_entries[i][j].push_back(bam_init1()); + } } - for (int i = 0; i < modulo; i++) { - bam_entries.push_back(vector>(config->threads)) ; - for (int j = 0; j < config->threads; j++) { - for (int k = 0; k < batch_size / config->threads; k++) { - bam_entries[i][j].push_back(bam_init1()) ; - } - } + } + int b = 0; + lprint({"Loading first batch.."}); + load_batch_bam(config->threads, batch_size, 1); + int p = 1; + ignored_reads.resize(config->threads); + smoothed_reads.resize(config->threads); + time_t t; + time(&t); + bool should_load = true; + bool should_process = true; + bool should_terminate = false; + bool loaded_last_batch = false; + int reads_written = 0; + // lprint({"Starting main loop.."}) ; + while (should_process) { + // lprint({"Beginning batch", to_string(b + 1)}); + if (!should_load) { + should_process = false; } - int b = 0 ; - lprint({"Loading first batch.."}); - load_batch_bam(config->threads, batch_size, 1) ; - int p = 1 ; - ignored_reads.resize(config->threads) ; - smoothed_reads.resize(config->threads) ; - time_t t ; - time(&t) ; - bool should_load = true ; - bool should_process = true ; - bool should_terminate = false ; - bool loaded_last_batch = false ; - int reads_written = 0 ; - //lprint({"Starting main loop.."}) ; - while (should_process) { - //lprint({"Beginning batch", to_string(b + 1)}); - if (!should_load) { - should_process = false ; - } - if (loaded_last_batch) { - should_load = false ; + if (loaded_last_batch) { + should_load = false; + } +#pragma omp parallel for num_threads(config->threads + 2) + for (int i = 0; i < config->threads + 2; i++) { + if (i == 0) { + if (should_load) { + loaded_last_batch = + !load_batch_bam(config->threads, batch_size, (p + 1) % modulo); + // if (loaded_last_batch) { + // lprint({"Last input batch loaded."}); + // } else { + // lprint({"Loaded."}); + // } } - #pragma omp parallel for num_threads(config->threads + 2) - for(int i = 0; i < config->threads + 2; i++) { - if (i == 0) { - if (should_load) { - loaded_last_batch = !load_batch_bam(config->threads, batch_size, (p + 1) % modulo) ; - //if (loaded_last_batch) { - // lprint({"Last input batch loaded."}); - //} else { - // lprint({"Loaded."}); - //} - } - } else if (i == 1) { - if (b >= 1) { - int ret = 0 ; - for (int k = 0; k < batch_size / config->threads; k++) { - for (int j = 0; j < config->threads; j++) { - if (bam_entries[(p + 2) % modulo][j][k] != nullptr) { - auto alignment = bam_entries[(p + 2) % modulo][j][k] ; - ret = sam_write1(out_bam_file, bam_header, bam_entries[(p + 2) % modulo][j][k]) ; - reads_written += 1 ; - if (ret < 0) { - lprint({"Can't write corrected BAM record, aborting.."}, 2); - should_terminate = true ; - break ; - } - } else { - break ; - } - } - } - } - } else { - if (should_process) { - process_batch(bam_entries[p][i - 2], p, i - 2) ; + } else if (i == 1) { + if (b >= 1) { + int ret = 0; + for (int k = 0; k < batch_size / config->threads; k++) { + for (int j = 0; j < config->threads; j++) { + if (bam_entries[(p + 2) % modulo][j][k] != nullptr) { + auto alignment = bam_entries[(p + 2) % modulo][j][k]; + ret = sam_write1(out_bam_file, bam_header, + bam_entries[(p + 2) % modulo][j][k]); + reads_written += 1; + if (ret < 0) { + lprint({"Can't write corrected 
BAM record, aborting.."}, 2); + should_terminate = true; + break; } + } else { + break; + } } + } } - if (should_terminate) { - lprint({"Something went wrong, aborting.."}, 2); - return ; + } else { + if (should_process) { + process_batch(bam_entries[p][i - 2], p, i - 2); } - //if (!should_load) { - // lprint({"Processed last batch of inputs."}); - //} - //if (!should_process) { - // lprint({"Exiting accelerator loop."}); - // break ; - //} - p += 1 ; - p %= modulo ; - b += 1 ; - time_t s ; - time(&s) ; - if (s - t == 0) { - s += 1 ; - } - cerr << "[I] Processed batch " << b << ". Reads so far " << reads_processed << ". Reads per second: " << reads_processed / (s - t) << ". Time: " << s - t << "\n" ; - cerr << "[I] Processed bases: " << uint64_t(global_num_bases) << ", num mismatch: " << uint64_t(global_num_mismatch) << ", mismatch rate: " << global_num_mismatch / global_num_bases << ", ignored reads: " << num_ignored_reads << "\n" ; - expected_mismatch_rate = global_num_mismatch / global_num_bases ; - cerr << "\x1b[A" ; - cerr << "\x1b[A" ; + } + } + if (should_terminate) { + lprint({"Something went wrong, aborting.."}, 2); + return; + } + // if (!should_load) { + // lprint({"Processed last batch of inputs."}); + // } + // if (!should_process) { + // lprint({"Exiting accelerator loop."}); + // break ; + // } + p += 1; + p %= modulo; + b += 1; + time_t s; + time(&s); + if (s - t == 0) { + s += 1; } - cerr << endl ; - cerr << endl ; - lprint({"Done."}); - sam_close(bam_file) ; - sam_close(out_bam_file) ; - dump_smoothed_read_ids() ; - lprint({"Loaded", to_string(reads_processed), "reads."}); - lprint({"Wrote", to_string(reads_written), "reads."}); + cerr << "[I] Processed batch " << b << ". Reads so far " << reads_processed + << ". Reads per second: " << reads_processed / (s - t) + << ". 
Time: " << s - t << "\n"; + cerr << "[I] Processed bases: " << uint64_t(global_num_bases) + << ", num mismatch: " << uint64_t(global_num_mismatch) + << ", mismatch rate: " << global_num_mismatch / global_num_bases + << ", ignored reads: " << num_ignored_reads << "\n"; + expected_mismatch_rate = global_num_mismatch / global_num_bases; + cerr << "\x1b[A"; + cerr << "\x1b[A"; + } + cerr << endl; + cerr << endl; + lprint({"Done."}); + sam_close(bam_file); + sam_close(out_bam_file); + dump_smoothed_read_ids(); + lprint({"Loaded", to_string(reads_processed), "reads."}); + lprint({"Wrote", to_string(reads_written), "reads."}); } void Smoother::dump_smoothed_read_ids() { - lprint({"Dumping smoothed read ids.."}) ; - ofstream qname_file(config->workdir + "/smoothed_reads.txt") ; - if (qname_file.is_open()) { - for (int i = 0; i < config->threads; i++) { - for (const auto& qname: smoothed_reads[i]) { - qname_file << qname << endl ; - } - } - } else { - lprint({"Error openning smoothed_reads.txt."}, 2) ; + lprint({"Dumping smoothed read ids.."}); + ofstream qname_file(config->workdir + "/smoothed_reads.txt"); + if (qname_file.is_open()) { + for (int i = 0; i < config->threads; i++) { + for (const auto &qname : smoothed_reads[i]) { + qname_file << qname << endl; + } } - qname_file.close() ; - ofstream ignore_file(config->workdir + "/ignored_reads.txt") ; - if (ignore_file.is_open()) { - for (int i = 0; i < config->threads; i++) { - for (const auto& qname: ignored_reads[i]) { - ignore_file << qname << endl ; - } - } - ignore_file.close() ; - } else { - lprint({"Error openning ignored_read.txt."}, 2) ; + } else { + lprint({"Error openning smoothed_reads.txt."}, 2); + } + qname_file.close(); + ofstream ignore_file(config->workdir + "/ignored_reads.txt"); + if (ignore_file.is_open()) { + for (int i = 0; i < config->threads; i++) { + for (const auto &qname : ignored_reads[i]) { + ignore_file << qname << endl; + } } + ignore_file.close(); + } else { + lprint({"Error openning ignored_read.txt."}, 2); + } } bool Smoother::load_batch_bam(int threads, int batch_size, int p) { - int n = 0 ; - int i = 0 ; - int m = 0 ; - while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { - auto alignment = bam_entries[p][n % threads][i] ; - if (alignment == nullptr) { - break ; - } - reads_processed += 1 ; - uint32_t l = alignment->core.l_qseq ; //length of the read - if (read_seq_max_lengths[p][n % threads][i] < l) { - free(read_seqs[p][n % threads][i]) ; - free(new_read_seqs[p][n % threads][i]) ; - free(new_read_quals[p][n % threads][i]) ; - // - read_seqs[p][n % threads][i] = (char*) malloc(sizeof(char) * (l + 1)) ; - new_read_seqs[p][n % threads][i] = (char*) malloc(sizeof(char) * (l + 1)) ; - new_read_quals[p][n % threads][i] = (uint8_t*) malloc(sizeof(char) * (l + 1)) ; - read_seq_max_lengths[p][n % threads][i] = l ; - m += 1 ; - } - read_seq_lengths[p][n % threads][i] = l ; - uint8_t *q = bam_get_seq(alignment) ; - for (int _ = 0; _ < l; _++){ - read_seqs[p][n % threads][i][_] = seq_nt16_str[bam_seqi(q, _)] ; - } - read_seqs[p][n % threads][i][l] = '\0' ; - n += 1 ; - if (n % threads == 0) { - i += 1 ; - } - if (n == batch_size) { - break ; - } + int n = 0; + int i = 0; + int m = 0; + while (sam_read1(bam_file, bam_header, bam_entries[p][n % threads][i]) >= 0) { + auto alignment = bam_entries[p][n % threads][i]; + if (alignment == nullptr) { + break; } - //cout << m << " reallocations.." 
<< endl ; - // last batch was incomplete - if (n != batch_size) { - for (int j = n % threads; j < threads; j++) { - //cout << "Terminus at " << j << " " << i << endl ; - for (int _ = i; _ < batch_size / threads; _++) { - bam_entries[p][j][_] = nullptr ; - } - } - for (int j = 0; j < n % threads; j++) { - //cout << "Terminus at " << j << " " << i + 1 << endl ; - for (int _ = i + 1; _ < batch_size / threads; _++) { - bam_entries[p][j][_] = nullptr ; - } - } + reads_processed += 1; + uint32_t l = alignment->core.l_qseq; // length of the read + if (read_seq_max_lengths[p][n % threads][i] < l) { + free(read_seqs[p][n % threads][i]); + free(new_read_seqs[p][n % threads][i]); + free(new_read_quals[p][n % threads][i]); + // + read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1)); + new_read_seqs[p][n % threads][i] = (char *)malloc(sizeof(char) * (l + 1)); + new_read_quals[p][n % threads][i] = + (uint8_t *)malloc(sizeof(char) * (l + 1)); + read_seq_max_lengths[p][n % threads][i] = l; + m += 1; + } + read_seq_lengths[p][n % threads][i] = l; + uint8_t *q = bam_get_seq(alignment); + for (int _ = 0; _ < l; _++) { + read_seqs[p][n % threads][i][_] = seq_nt16_str[bam_seqi(q, _)]; + } + read_seqs[p][n % threads][i][l] = '\0'; + n += 1; + if (n % threads == 0) { + i += 1; + } + if (n == batch_size) { + break; + } + } + // cout << m << " reallocations.." << endl ; + // last batch was incomplete + if (n != batch_size) { + for (int j = n % threads; j < threads; j++) { + // cout << "Terminus at " << j << " " << i << endl ; + for (int _ = i; _ < batch_size / threads; _++) { + bam_entries[p][j][_] = nullptr; + } + } + for (int j = 0; j < n % threads; j++) { + // cout << "Terminus at " << j << " " << i + 1 << endl ; + for (int _ = i + 1; _ < batch_size / threads; _++) { + bam_entries[p][j][_] = nullptr; + } } - //lprint({"Loaded", to_string(n), "BAM reads.."}); - return n != 0 ? true : false ; + } + // lprint({"Loaded", to_string(n), "BAM reads.."}); + return n != 0 ? true : false; } diff --git a/smoother.hpp b/smoother.hpp index 6ca2fab..1647638 100644 --- a/smoother.hpp +++ b/smoother.hpp @@ -1,70 +1,71 @@ #ifndef SNP_HPP #define SNP_HPP -#include -#include -#include -#include +#include #include -#include +#include #include #include -#include #include +#include #include +#include +#include #include +#include #include #include "bam.hpp" -#include "vcf.hpp" -#include "fastq.hpp" +#include "chromosomes.hpp" #include "config.hpp" +#include "fastq.hpp" #include "lprint.hpp" -#include "chromosomes.hpp" +#include "vcf.hpp" -using namespace std ; +using namespace std; -#define bam_set_seqi(s,i,b) ((s)[(i)>>1] = ((s)[(i)>>1] & (0xf0 >> ((~(i)&1)<<2))) | ((b)<<((~(i)&1)<<2))) +#define bam_set_seqi(s, i, b) \ + ((s)[(i) >> 1] = \ + ((s)[(i) >> 1] & (0xf0 >> ((~(i)&1) << 2))) | ((b) << ((~(i)&1) << 2))) class Smoother { public: + // Loading BAM file + samFile *bam_file; + bam_hdr_t *bam_header; + hts_idx_t *bam_index; + // output BAM file + samFile *out_bam_file; + //