Skip to content

Commit

Permalink
Merge SFS and ExtSFS into single struct
Browse files Browse the repository at this point in the history
  • Loading branch information
ldenti committed Jul 26, 2023
1 parent 1bbd4a8 commit 1c5acec
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 106 deletions.
10 changes: 5 additions & 5 deletions assembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ vector<SFS> Assembler::assemble(vector<SFS> &sfs) {
while (i < sfs.size()) {
size_t j;
for (j = i + 1; j < sfs.size(); ++j) {
if (sfs[j - 1].s + sfs[j - 1].l <= sfs[j].s) {
if (sfs[j - 1].qs + sfs[j - 1].l <= sfs[j].qs) {
// non-overlapping
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s;
assembled_sfs.push_back(SFS(sfs[i].s, l, sfs[i].htag));
uint l = sfs[j - 1].qs + sfs[j - 1].l - sfs[i].qs;
assembled_sfs.push_back(SFS(sfs[i].qname, sfs[i].qs, l, sfs[i].htag));
i = j;
break;
}
}
if (j == sfs.size()) {
uint l = sfs[j - 1].s + sfs[j - 1].l - sfs[i].s;
assembled_sfs.push_back(SFS(sfs[i].s, l, sfs[i].htag));
uint l = sfs[j - 1].qs + sfs[j - 1].l - sfs[i].qs;
assembled_sfs.push_back(SFS(sfs[i].qname, sfs[i].qs, l, sfs[i].htag));
i = j;
}
}
Expand Down
72 changes: 36 additions & 36 deletions clusterer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,10 @@ void Clusterer::extend_alignment(bam1_t *aln, int index) {
pair<uint, uint> lclip = make_pair(0, 0);
pair<uint, uint> rclip = make_pair(0, 0);
int last_pos = 0;
vector<ExtSFS> local_extended_sfs;
vector<SFS> local_extended_sfs;
for (const SFS &sfs : SFSs->at(qname)) {
int s = sfs.s;
int e = sfs.s + sfs.l - 1;
int s = sfs.qs;
int e = sfs.qs + sfs.l - 1;
int hp_tag = sfs.htag;
int aln_start = -1;
int aln_end = -1;
Expand Down Expand Up @@ -297,33 +297,33 @@ void Clusterer::extend_alignment(bam1_t *aln, int index) {
}
// FIXME: understand why this is happening (chr16 on full giab genome)
if ((uint)prekmer.second > postkmer.second + config->ksize) {
spdlog::warn("Error on {}. SFS starting at {} (length {})", qname, sfs.s,
spdlog::warn("Error on {}. SFS starting at {} (length {})", qname, sfs.qs,
sfs.l);
} else {
local_extended_sfs.push_back(
ExtSFS(string(chrom), string(qname), prekmer.second,
postkmer.second + config->ksize, prekmer.first,
postkmer.first + config->ksize, hp_tag));
SFS(string(chrom), string(qname), prekmer.second,
postkmer.second + config->ksize, prekmer.first,
postkmer.first + config->ksize, hp_tag));
}
}
// When two SFSs are close but not overlapping, we may end up with two
// overlapping extended SFSs. We need to merge SFSs two by two until we don't
// need to merge anything
vector<ExtSFS> merged_extended_sfs;
vector<SFS> merged_extended_sfs;
for (size_t i = 0; i < local_extended_sfs.size(); ++i) {
size_t j;
for (j = 0; j < merged_extended_sfs.size(); ++j) {
if ((local_extended_sfs.at(i).s <= merged_extended_sfs.at(j).s &&
merged_extended_sfs.at(j).s <= local_extended_sfs.at(i).e) ||
(merged_extended_sfs.at(j).s <= local_extended_sfs.at(i).s &&
local_extended_sfs.at(i).s <= merged_extended_sfs.at(j).e))
if ((local_extended_sfs.at(i).rs <= merged_extended_sfs.at(j).rs &&
merged_extended_sfs.at(j).rs <= local_extended_sfs.at(i).re) ||
(merged_extended_sfs.at(j).rs <= local_extended_sfs.at(i).rs &&
local_extended_sfs.at(i).rs <= merged_extended_sfs.at(j).re))
break;
}
if (j < merged_extended_sfs.size()) {
merged_extended_sfs[j].s =
min(merged_extended_sfs.at(j).s, local_extended_sfs.at(i).s);
merged_extended_sfs[j].e =
max(merged_extended_sfs.at(j).e, local_extended_sfs.at(i).e);
merged_extended_sfs[j].rs =
min(merged_extended_sfs.at(j).rs, local_extended_sfs.at(i).rs);
merged_extended_sfs[j].re =
max(merged_extended_sfs.at(j).re, local_extended_sfs.at(i).re);
merged_extended_sfs[j].qs =
min(merged_extended_sfs.at(j).qs, local_extended_sfs.at(i).qs);
merged_extended_sfs[j].qe =
Expand Down Expand Up @@ -405,16 +405,16 @@ Clusterer::get_unique_kmers(const vector<pair<int, int>> &alpairs, const uint k,
void Clusterer::cluster_by_proximity() {
sort(extended_SFSs.begin(), extended_SFSs.end());
auto r = max_element(extended_SFSs.begin(), extended_SFSs.end(),
[](const ExtSFS &lhs, const ExtSFS &rhs) {
return lhs.e - lhs.s < rhs.e - rhs.s;
[](const SFS &lhs, const SFS &rhs) {
return lhs.re - lhs.rs < rhs.re - rhs.rs;
});
int dist = (r->e - r->s) * 1.1; // TODO: add to CLI
int dist = (r->re - r->rs) * 1.1; // TODO: add to CLI
spdlog::info(
"Maximum extended SFS length: {}bp. Using separation distance: {}bp.",
r->e - r->s, dist);
r->re - r->rs, dist);
// Cluster SFSs inside dist-bp windows
int prev_i = 0;
int prev_e = extended_SFSs[0].e;
int prev_e = extended_SFSs[0].re;
string prev_chrom = extended_SFSs[0].chrom;
vector<pair<int, int>> intervals;
for (size_t i = 1; i < extended_SFSs.size(); i++) {
Expand All @@ -424,12 +424,12 @@ void Clusterer::cluster_by_proximity() {
prev_chrom = sfs.chrom;
intervals.push_back(make_pair(prev_i, i - 1));
prev_i = i;
prev_e = sfs.e;
prev_e = sfs.re;
continue;
} else {
if (sfs.s - prev_e > dist) {
if (sfs.rs - prev_e > dist) {
intervals.push_back(make_pair(prev_i, i - 1));
prev_e = sfs.e;
prev_e = sfs.re;
prev_i = i;
}
}
Expand All @@ -438,28 +438,28 @@ void Clusterer::cluster_by_proximity() {

// Cluster SFS inside each interval
_p_sfs_clusters.resize(
config->threads); // vector<map<pair<int, int>, vector<ExtSFS>>>
config->threads); // vector<map<pair<int, int>, vector<SFS>>>
#pragma omp parallel for num_threads(config->threads) schedule(static, 1)
for (size_t i = 0; i < intervals.size(); i++) {
int t = omp_get_thread_num();
int j = intervals[i].first;
int low = extended_SFSs[j].s;
int high = extended_SFSs[j].e;
int low = extended_SFSs[j].rs;
int high = extended_SFSs[j].re;
int last_j = j;
j++;
for (; j <= intervals[i].second; j++) {
const ExtSFS &sfs = extended_SFSs[j];
if (sfs.s <= high) {
low = min(low, sfs.s);
high = max(high, sfs.e);
const SFS &sfs = extended_SFSs[j];
if (sfs.rs <= high) {
low = min(low, sfs.rs);
high = max(high, sfs.re);
} else {
for (int k = last_j; k < j;
k++) { // CHECKME: < or <=?
// NOTE: <= makes the code waaaay slower
_p_sfs_clusters[t][make_pair(low, high)].push_back(extended_SFSs[k]);
}
low = sfs.s;
high = sfs.e;
low = sfs.rs;
high = sfs.re;
last_j = j;
}
}
Expand Down Expand Up @@ -499,9 +499,9 @@ void Clusterer::fill_clusters() {
set<string> reads;
int min_s = numeric_limits<int>::max();
int max_e = 0;
for (const ExtSFS &sfs : cluster.SFSs) {
min_s = min(min_s, sfs.s);
max_e = max(max_e, sfs.e);
for (const SFS &sfs : cluster.SFSs) {
min_s = min(min_s, sfs.rs);
max_e = max(max_e, sfs.re);
reads.insert(sfs.qname);
}

Expand Down
10 changes: 5 additions & 5 deletions clusterer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ struct Cluster {
int s;
int e;
int cov;
vector<ExtSFS> SFSs;
vector<SFS> SFSs;
vector<string> names;
vector<string> seqs;

Cluster(){};

Cluster(const vector<ExtSFS> &_SFSs) {
Cluster(const vector<SFS> &_SFSs) {
SFSs = _SFSs;
chrom = _SFSs[0].chrom;
}
Expand Down Expand Up @@ -78,7 +78,7 @@ class Clusterer {
private:
Configuration *config;
unordered_map<string, vector<SFS>> *SFSs;
vector<ExtSFS> extended_SFSs;
vector<SFS> extended_SFSs;
samFile *bam_file;
hts_idx_t *bam_index;
bam_hdr_t *bam_header;
Expand Down Expand Up @@ -107,9 +107,9 @@ class Clusterer {

// parallelize
vector<vector<Clip>> _p_clips;
vector<vector<ExtSFS>> _p_extended_sfs;
vector<vector<SFS>> _p_extended_sfs;
vector<vector<vector<bam1_t *>>> bam_entries;
vector<map<pair<int, int>, vector<ExtSFS>>> _p_sfs_clusters;
vector<map<pair<int, int>, vector<SFS>>> _p_sfs_clusters;

public:
Clusterer(unordered_map<string, vector<SFS>> *);
Expand Down
20 changes: 9 additions & 11 deletions ping_pong.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ void seq_char2nt6(int l, unsigned char *s) {
}

/* Compute SFS strings from P and store them into solutions*/
void PingPong::ping_pong_search(rld_t *index, uint8_t *P, int l,
vector<sfs_type_t> &solutions, int hp_tag) {
void PingPong::ping_pong_search(rld_t *index, const string &qname, uint8_t *P,
int l, vector<SFS> &solutions, int hp_tag) {
rldintv_t sai;
int begin = l - 1;
while (begin >= 0) {
Expand Down Expand Up @@ -58,16 +58,14 @@ void PingPong::ping_pong_search(rld_t *index, uint8_t *P, int l,
// fmatches: " << fmatches << endl ;) DEBUG(cerr << "Adding [" << begin <<
// ", " << end << "]." << endl ;)
int sfs_len = end - begin + 1;
SFS sfs = SFS{begin, sfs_len, hp_tag};
SFS sfs(qname, begin, sfs_len, hp_tag);
solutions.push_back(sfs);
if (begin == 0) {
if (begin == 0)
break;
}
if (config->overlap == 0) { // Relaxed
if (config->overlap == 0) // Relaxed
begin -= 1;
} else {
else
begin = end + config->overlap; // overlap < 0
}
}
}

Expand Down Expand Up @@ -191,7 +189,7 @@ batch_type_t PingPong::process_batch(rld_t *index, int p, int thread) {
if (!bam_mode) {
for (uint j = 0; j < read_seqs[p][thread].size(); j++) {
ping_pong_search(
index, read_seqs[p][thread][j],
index, read_names[p][thread][j], read_seqs[p][thread][j],
strlen(
(char *)read_seqs[p][thread]
[j]), // FIXME: this may be inefficient. We were
Expand All @@ -212,7 +210,7 @@ batch_type_t PingPong::process_batch(rld_t *index, int p, int thread) {
: 0;
if (config->putative and xf_t != 0)
continue;
ping_pong_search(index, read_seqs[p][thread][j], aln->core.l_qseq,
ping_pong_search(index, qname, read_seqs[p][thread][j], aln->core.l_qseq,
solutions[qname], hp_t);
}
}
Expand All @@ -235,7 +233,7 @@ void PingPong::output_batch(int b) {
for (const SFS &sfs : assembled_SFSs) {
// optimize file output size by not outputting read name for every
// SFS
cout << (is_first ? read.first : "*") << "\t" << sfs.s << "\t"
cout << (is_first ? read.first : "*") << "\t" << sfs.qs << "\t"
<< sfs.l << "\t" << sfs.htag << "\t" << endl;
is_first = false;
}
Expand Down
13 changes: 6 additions & 7 deletions ping_pong.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s) {
return l;
}

typedef SFS sfs_type_t;
typedef map<string, vector<sfs_type_t>> batch_type_t;
typedef map<string, vector<SFS>> batch_type_t;

static const vector<string> int2char({"$", "A", "C", "G", "T", "N"});

Expand All @@ -95,11 +94,11 @@ class PingPong {
vector<vector<vector<string>>> read_names;
vector<vector<vector<bam1_t *>>> bam_entries;
vector<vector<vector<fastq_entry_t>>> fastq_entries;
bool load_batch_bam(int p);
bool load_batch_fastq(int threads, int batch_size, int p);
batch_type_t process_batch(rld_t *index, int p, int i);
void ping_pong_search(rld_t *index, uint8_t *seq, int l,
vector<sfs_type_t> &solutions, int hp);
bool load_batch_bam(int);
bool load_batch_fastq(int, int, int);
batch_type_t process_batch(rld_t *, int, int);
void ping_pong_search(rld_t *, const string &, uint8_t *, int, vector<SFS> &,
int);
void output_batch(int);

vector<vector<batch_type_t>> obatches;
Expand Down
5 changes: 1 addition & 4 deletions sfs.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include "sfs.hpp"

bool operator<(const SFS &x, const SFS &y) { return x.s < y.s; }

// TODO: if we split the .sfs into more files, we can load it using more threads
// (but for SVs having a single file shouldn't be the bottleneck)
unordered_map<string, vector<SFS>> parse_sfsfile(const string &sfs_path) {
Expand All @@ -22,9 +20,8 @@ unordered_map<string, vector<SFS>> parse_sfsfile(const string &sfs_path) {
read_name = info[0];
SFSs[read_name] = vector<SFS>();
}
// TODO
SFSs[read_name].push_back(
SFS(stoi(info[1]), stoi(info[2]), stoi(info[3])));
SFS(read_name, stoi(info[1]), stoi(info[2]), stoi(info[3])));
++total;
}
}
Expand Down
Loading

0 comments on commit 1c5acec

Please sign in to comment.