-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.hpp
414 lines (359 loc) · 13.3 KB
/
utils.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#pragma once
#include <charconv>
#include <cstdio>
#include <filesystem>
#include <fstream>
#include <indicators.hpp>
#include <json.hpp>
#include <memory>
#include <random>
#include <robinhood.hpp>
#include <string_view>
#include <unordered_map>
#include <xhash.hpp>
#include "alphabet.hpp"
using namespace indicators; // for progress bars
using json = nlohmann::json;
namespace fs = std::filesystem;
// type definitions
using raw_seq_t = std::vector<char>; // just a normal ascii sequence
using encoded_seq_t = std::vector<int8_t>; // a sequence encoded into an alphabet
using dataset_t = std::vector<encoded_seq_t>; // list of encoded sequences
using seqview_t = std::basic_string_view<int8_t>; // view of encoded sequence
template <typename K, typename V> using umap_t = robin_hood::unordered_map<K, V>;
using kmap_t = umap_t<seqview_t, std::vector<uint64_t>>; // seqview -> counts
using datacount_t = umap_t<uint64_t, std::vector<uint64_t>>; // counts per dataset id
using multikmap_t = umap_t<seqview_t, datacount_t>; // seqview -> counts per dataset
/*inline datacount_t deserialize(const std::string& s) {*/
// datacount_t d{};
// auto j = json::parse(s);
// for (const auto& e : j.items())
// d[atoll(e.key().c_str())] = e.value().get<std::vector<uint64_t>>();
// return d;
//}
// inline std::string serialize(const datacount_t& d) {
// json j{};
// for (const auto& [k, v] : d) j[std::to_string(k)] = v;
// return j.dump();
//}
// parse the first number before delim, returns the rest of the substring, not including
// delim
template <typename T>
std::string_view firstFromChars(const std::string_view& sv, char delim, T& res) {
size_t p = sv.find_first_of(delim);
std::from_chars(sv.data(), sv.data() + p, res);
return sv.substr(std::min(p + 1, sv.size()), sv.size() - p);
}
// splits a string view using delim, and calls f for each substring
template <typename F>
void splt(const std::string_view& sv, char delim, F&& f, size_t i = 0) {
if (sv.size() > 0) {
size_t p = sv.find_first_of(delim);
std::forward<F>(f)(sv.substr(0, p), i);
if (p != std::string_view::npos) {
splt(sv.substr(std::min(p + 1, sv.size()), sv.size() - p), delim,
std::forward<F>(f), i + 1);
}
}
}
inline datacount_t deserialize(const std::string& s) {
datacount_t d;
if (s.size() == 0) return d;
std::string_view sv(s);
// getting number of letters in alphabet
uint32_t nAlpha;
sv = firstFromChars(sv, 'A', nAlpha);
// for each dataset
splt(sv, ' ', [&](const auto& dcpair, int) {
uint64_t datasetId;
auto countstr = firstFromChars(dcpair, '|', datasetId);
std::vector<uint64_t> countArr(nAlpha);
// for each pair letter:count
splt(countstr, ',', [&](const auto& dcpair, int) {
uint64_t lc[2] = {0, 0}; // [letter, count]
splt(dcpair, ':', [&](const auto& substr, int i) {
std::from_chars(substr.data(), substr.data() + substr.size(), lc[i]);
});
countArr[lc[0]] = lc[1];
});
d[datasetId] = countArr;
});
return d;
}
inline std::string serialize(const datacount_t& d) {
std::string s;
if (d.size() == 0) return s;
uint32_t nAlpha = d.begin()->second.size();
for (const auto& [k, v] : d) {
s += std::to_string(k) + "|";
for (size_t i = 0; i < v.size(); ++i) {
if (v[i] > 0) s += std::to_string(i) + ":" + std::to_string(v[i]) + ",";
}
s.pop_back();
s += " ";
}
s.pop_back();
return std::to_string(nAlpha) + "A" + s;
}
inline void addCounts(std::vector<uint64_t>& a, const std::vector<uint64_t>& b) {
if (a.size() == 0)
a = b;
else
for (size_t i = 0; i < a.size(); ++i) a[i] += b[i];
}
// encode / decode. Important for 2 reasons:
// - normalizing uppercase & lowercases
// - encoding of a letter = index in the count array
static const constexpr Alpha alphaMap{};
inline encoded_seq_t encodeSequence(const raw_seq_t& s, const Alphabet& alpha) {
encoded_seq_t r(s.size() + 2); // don't forget prepend and append symbols
r[0] = alphaMap.encode('[', alpha);
for (size_t i = 1; i < r.size() - 1; ++i) r[i] = alphaMap.encode(s[i - 1], alpha);
r[r.size() - 1] = alphaMap.encode(']', alpha);
return r;
}
inline raw_seq_t decodeSequence(const encoded_seq_t& s, const Alphabet& alpha) {
const auto decode = (alpha == Alphabet::dna) ?
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::dna>(c); } :
((alpha == Alphabet::rna) ?
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::rna>(c); } :
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::protein>(c); });
raw_seq_t r(s.size() - 2);
for (size_t i = 0; i < s.size() - 2; ++i) r[i] = decode(s[i + 1]);
return r;
}
inline raw_seq_t decodeSequenceView(const seqview_t& s, const Alphabet& alpha) {
const auto decode = (alpha == Alphabet::dna) ?
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::dna>(c); } :
((alpha == Alphabet::rna) ?
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::rna>(c); } :
[](const int8_t& c) -> char { return alphaMap.decode<Alphabet::protein>(c); });
raw_seq_t r(s.size());
for (size_t i = 0; i < s.size(); ++i) r[i] = decode(s[i]);
return r;
}
struct PBar {
int barid = -1;
DynamicProgress<ProgressBar>* bars = nullptr;
size_t total = 100;
size_t ticks = 0;
size_t ticksFromStart = 0;
PBar(DynamicProgress<ProgressBar>* bars, size_t total, const std::string& msg)
: bars(bars), total(total) {
if (bars) {
barid = bars->make_bar(option::BarWidth{50}, option::Start{"["}, option::Fill{"■"},
option::Lead{"■"}, option::Remainder{"-"}, option::End{"]"},
option::PostfixText{msg}, option::ShowElapsedTime{true},
option::ForegroundColor{Color::yellow},
option::MaxProgress{total + 1},
option::FontStyles{std::vector<FontStyle>{FontStyle::bold}});
}
}
inline void step(size_t i = 1) {
if (bars) {
ticks += i;
ticksFromStart += i;
if (ticks > total / 1000) {
// std::cerr << "tick " << ticksFromStart << " / " << total << std::endl;
if (ticksFromStart <= total) (*bars)[barid].tick(ticks);
ticks = 0;
}
}
}
inline void completeMsg(const std::string& msg) {
if (bars) {
auto& b = (*bars)[barid];
b.set_option(option::ForegroundColor{Color::green});
b.set_option(option::PrefixText{"✔ " + msg});
b.set_option(option::BarWidth{0});
b.set_option(option::Fill{""});
b.set_option(option::Lead{""});
b.set_option(option::Start{""});
b.set_option(option::End{""});
b.set_option(option::ShowPercentage{false});
b.set_option(option::PostfixText{" "});
(*bars)[barid]; // update
}
}
inline void complete() {
if (bars) {
(*bars)[barid].mark_as_completed();
(*bars)[barid]; // update
}
}
};
template <typename B>
std::vector<encoded_seq_t> readFasta(const std::string& filepath, const Alphabet& alpha,
B& pbars, size_t barid) {
fs::path p{filepath};
std::vector<encoded_seq_t> res;
std::ifstream file(filepath.c_str());
size_t charactersRead = 0;
raw_seq_t currentSeq{};
bool record = true; // should we record the lines. Useful for fastQ format
auto recordSeq = [&]() { // pushes a new sequence and empty currentSeq
if (currentSeq.size() > 0)
res.push_back(encodeSequence(std::exchange(currentSeq, {}), alpha));
};
for (std::string line{}; std::getline(file, line);) {
if (line[0] == '>' || line[0] == '@') {
record = true;
recordSeq();
} else if (line[0] == '+') {
record = false;
} else {
if (record) {
charactersRead += line.size();
currentSeq.insert(currentSeq.end(), line.begin(), line.end());
}
}
if (charactersRead > 10000) pbars[barid].tick(std::exchange(charactersRead, 0));
}
recordSeq();
pbars[barid].tick(charactersRead);
return res;
}
// returns the number of sequence letters in a fasta file
inline size_t getFastaSize(const fs::path& filepath) {
std::ifstream file(filepath.c_str());
size_t nLetters = 0;
bool fastq = false;
for (std::string line{}; std::getline(file, line);) {
if (line[0] != '>' && line[0] != '@' && line[0] != '+')
nLetters += line.size();
else if (line[0] == '@')
fastq = true;
}
if (fastq)
return nLetters / 2;
else
return nLetters;
}
inline auto readPaths(const std::string& inputFile) {
fs::path filePath{fs::absolute(fs::path(inputFile))};
auto basePath = filePath.parent_path();
std::ifstream file(filePath.c_str());
std::vector<fs::path> paths;
std::vector<size_t> sizes;
for (std::string line{}; std::getline(file, line);) {
fs::path datapath = fs::path(basePath) / line;
paths.push_back(datapath);
sizes.push_back(getFastaSize(datapath));
}
return std::make_pair(paths, sizes);
}
inline std::pair<std::vector<dataset_t>, size_t> readDatasets(
const std::vector<fs::path>& paths, const std::vector<size_t>& sizes, size_t offset,
size_t maxSize, Alphabet alpha, DynamicProgress<ProgressBar>& bars) {
assert(paths.size() == sizes.size());
size_t readSize = 0;
size_t next = offset;
while (next < sizes.size() && readSize < maxSize) {
readSize += sizes[next];
++next;
}
auto barid = bars.make_bar(
option::BarWidth{50}, option::Start{"["}, option::Fill{"■"}, option::Lead{"■"},
option::Remainder{"-"}, option::End{"]"}, option::ShowElapsedTime{true},
option::ForegroundColor{Color::yellow}, option::MaxProgress{readSize},
option::FontStyles{std::vector<FontStyle>{FontStyle::bold}});
std::vector<dataset_t> datasets;
for (size_t i = offset; i < next; ++i) {
bars[barid].set_option(option::PostfixText{"Loading " + paths[i].string()});
datasets.push_back(readFasta(paths[i], alpha, bars, barid));
}
bars[barid].mark_as_completed();
return {datasets, next};
}
// generate N random dna sequences of size L
inline std::vector<encoded_seq_t> randomDNASequences(int L, int N) {
const std::vector<char> letters{'A', 'C', 'G', 'T'};
std::random_device rd;
std::mt19937 gen(0);
std::uniform_int_distribution<> d(0, letters.size() - 1);
std::vector<encoded_seq_t> allseqs;
for (int i = 0; i < N; ++i) {
auto s = std::vector<char>(L);
for (auto& n : s) n = letters[d(gen)];
allseqs.push_back(encodeSequence(s, Alphabet::dna));
}
return allseqs;
}
inline raw_seq_t leftpad(const raw_seq_t& s, int k) {
// ensure that the sequence is the right length (prepends '[' if not)
auto leftpad = raw_seq_t(k - s.size(), '[');
leftpad.insert(leftpad.end(), s.begin(), s.end());
return leftpad;
}
inline void dumpMap(const kmap_t& m, int k, Alphabet alpha, std::string outputpath) {
const int CHUNKSIZE = 10000;
// std::ofstream file(outputpath);
// if (file.fail()) throw std::runtime_error("Error opening output file");
std::FILE* file = std::fopen(outputpath.c_str(), "w");
if (!file) throw std::runtime_error("Error opening output file");
ProgressSpinner spinner{option::PostfixText{"Writing to " + outputpath},
option::ForegroundColor{Color::yellow},
option::FontStyles{std::vector<FontStyle>{FontStyle::bold}}};
auto alphabet = Alpha::getAlphabet(alpha);
// Title line:
// KMER, char0, char1, char2, ..., ']'
// we ignore the last column since it's the begin character. It can't be in the counts
std::fprintf(file, "%s", "kmer");
for (size_t i = 0; i < alphabet.size() - 1; ++i) fprintf(file, ", %c", alphabet[i]);
std::fprintf(file, "%s", "\n");
int c = 0;
std::string buff;
for (const auto& kv : m) {
auto decoded = decodeSequenceView(kv.first, alpha);
auto leftpad = raw_seq_t(k - decoded.size(), '[');
buff.insert(buff.end(), leftpad.begin(), leftpad.end());
buff.insert(buff.end(), decoded.begin(), decoded.end());
for (size_t i = 0; i < kv.second.size() - 1; ++i)
buff += ", " + std::to_string(kv.second[i]);
buff += "\n";
if (++c % CHUNKSIZE == 0) {
std::fwrite(buff.c_str(), 1, buff.size(), file);
buff.clear();
spinner.set_progress(static_cast<int>(((float)c / (float)m.size()) * 100));
}
}
std::fwrite(buff.c_str(), 1, buff.size(), file);
std::fclose(file);
spinner.set_option(option::ForegroundColor{Color::green});
spinner.set_option(option::PrefixText{"✔"});
spinner.set_option(option::ShowSpinner{false});
spinner.set_option(option::ShowPercentage{false});
spinner.set_option(option::PostfixText{"All k-mers written!"});
spinner.mark_as_completed();
}
// merge multiple multikmaps into one multikmap
inline void mergeMultiMaps(multikmap_t& res, std::vector<multikmap_t>& maps,
size_t startIndex = 0,
DynamicProgress<ProgressBar>* bars = nullptr) {
size_t N = maps.size();
size_t entries = 0;
if (bars)
for (size_t m = startIndex; m < N; ++m) entries += maps[m].size();
PBar p{bars, entries, "Merging " + std::to_string(N) + " K-Maps in memory"};
for (size_t m = startIndex; m < N; ++m) {
for (auto& [kmer, map] : maps[m]) {
if (!res.count(kmer))
res[kmer] = decltype(map)(std::move(map));
else {
for (auto& [dataset, counts] : map) {
if (res[kmer].count(dataset)) {
assert(res[kmer].size() == counts.size());
auto& ref = res[kmer][dataset];
for (size_t i = 0; i < counts.size(); ++i) ref[i] += counts[i];
} else {
res[kmer][dataset] = counts;
}
}
}
p.step();
}
}
p.completeMsg("✔ Merged " + std::to_string(maps.size()) + " kmaps. Cleaning memory...");
for (size_t m = startIndex; m < N; ++m) maps[m].clear(); // avoid wasting memory
p.complete();
}