-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcompute_lev_masked.cpp
More file actions
64 lines (50 loc) · 1.8 KB
/
compute_lev_masked.cpp
File metadata and controls
64 lines (50 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include "sarlacc.h"
#include "DNA_input.h"
#include <vector>
#include <algorithm>
/* This function computes a masked version of the Levenshtein distance,
* where N's always contribute a mismatch no matter what they are matched to.
* In effect, they are treated as 'missing' values.
*/
SEXP compute_lev_masked(SEXP sequences) {
BEGIN_RCPP
auto seqs=process_DNA_input(sequences);
const size_t nseq=seqs->size();
Rcpp::NumericVector output((nseq-1)*nseq/2);
auto oIt=output.begin();
// Setting up the odds and ends required
std::vector<double> col, prev_col;
col.reserve(50);
prev_col.reserve(50);
for (size_t i=0; i<nseq; ++i) {
auto iseq=seqs->get(i);
const char* istr=iseq.first;
const size_t ilen=iseq.second;
if (prev_col.size() < ilen) {
col.resize(ilen + 1);
prev_col.resize(ilen + 1);
}
auto altseqs=process_DNA_input(sequences);
for (size_t j=i+1; j<nseq; ++j) {
auto jseq=altseqs->get(j);
const char* jstr=jseq.first;
const size_t jlen=jseq.second;
// Computing the levenshtein distance.
std::iota(prev_col.begin(), prev_col.end(), 0);
for (size_t jx=0; jx<jlen; ++jx) {
col[0]=jx+1;
const char jbase=jstr[jx];
for (size_t ix=0; ix<ilen; ++ix) {
const char ibase=istr[ix];
const double match_score = (jbase=='N' || ibase=='N' ? 0.5 : (jbase==ibase ? 0 : 1));
col[ix+1] = std::min({ prev_col[ix+1] + 1, col[ix] + 1, prev_col[ix] + match_score });
}
col.swap(prev_col);
}
(*oIt)=prev_col[ilen];
++oIt;
}
}
return output;
END_RCPP
}