-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathhomopolymer.cpp
More file actions
210 lines (181 loc) · 6.85 KB
/
homopolymer.cpp
File metadata and controls
210 lines (181 loc) · 6.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#include "sarlacc.h"
#include <deque>
#include <stdexcept>
struct rle_walker {
rle_walker(const char * p, const size_t l) : ptr(p), len(l),
last_pos(0), cur_pos(0), nonbases(0), last_base(0), next_base(0),
true_last_pos(0) {
while (cur_pos < len) {
next_base=DNAdecode(ptr[cur_pos]);
if (next_base!='-') {
break;
}
++nonbases;
++cur_pos;
}
return;
}
// Advances to the next run.
void advance() {
last_pos=cur_pos;
true_last_pos=last_pos - nonbases; // before 'nonbases' is modified.
last_base=next_base;
++cur_pos;
while (cur_pos < len) {
next_base=DNAdecode(ptr[cur_pos]);
if (next_base!='-' && next_base!=last_base) {
break;
}
++cur_pos;
if (next_base=='-') {
++nonbases;
}
}
return;
}
bool is_finished () const { return (cur_pos==len); }
// Direct statistics on the current run.
size_t get_start() const { return true_last_pos; }
size_t get_length() const { return (cur_pos - nonbases) - true_last_pos; }
char get_base() const { return last_base; }
// Statistics on the current run with respect to alignment position (i.e., gap characters).
size_t get_run_start() const { return last_pos; }
size_t get_run_start_with_gaps() const {
size_t pos=last_pos;
while (pos > 0) {
--pos;
if (DNAdecode(ptr[pos])!='-') {
++pos;
break;
}
}
return pos;
}
size_t get_run_end() const {
size_t pos=cur_pos;
while (pos > last_pos) {
--pos;
if (DNAdecode(ptr[pos])!='-') {
++pos;
break;
}
}
return pos;
}
size_t get_run_end_with_gaps() const { return cur_pos; }
private:
const char * ptr;
const size_t len;
size_t last_pos, cur_pos, nonbases, true_last_pos;
char last_base, next_base;
};
/* This function identifies homopolymers in a given set of sequences,
* returning the position, length and base of the homopolymer.
*/
SEXP find_homopolymers (SEXP sequences) {
BEGIN_RCPP
// Checking inputs.
auto seq=hold_XStringSet(sequences);
const size_t nseq=get_length_from_XStringSet_holder(&seq);
// Creating outputs.
std::deque<int> collected_index, collected_pos, collected_size;
std::deque<char> collected_base;
// Running through each string and returning the number of times we see a homopolymer.
for (size_t i=0; i<nseq; ++i) {
auto curseq=get_elt_from_XStringSet_holder(&seq, i);
const char* sstr=curseq.ptr;
const size_t slen=curseq.length;
rle_walker SQ(sstr, slen);
while (!SQ.is_finished()) {
SQ.advance();
const size_t homolen=SQ.get_length();
if (homolen==1) {
continue;
}
collected_index.push_back(i);
collected_pos.push_back(SQ.get_start()+1);
collected_size.push_back(homolen);
collected_base.push_back(SQ.get_base());
}
}
Rcpp::List output(4);
output[0]=Rcpp::IntegerVector(collected_index.begin(), collected_index.end());
output[1]=Rcpp::IntegerVector(collected_pos.begin(), collected_pos.end());
output[2]=Rcpp::IntegerVector(collected_size.begin(), collected_size.end());
// Bit more effort to convert character strings.
Rcpp::StringVector bases(collected_base.size());
auto bIt=bases.begin();
char buffer[2];
buffer[1]='\0';
for (auto c : collected_base) {
buffer[0]=c;
*(bIt++) = Rcpp::String(buffer);
}
output[3]=bases;
return output;
END_RCPP
}
/* This function identifies the homopolymer in a reference alignment string,
* and returns the number of observed homopolymers in the read alignment string.
* Some effort is required to handle the gap character '-'.
*/
SEXP match_homopolymers (SEXP ref_align, SEXP read_align) {
BEGIN_RCPP
auto rf_aln=hold_XStringSet(ref_align);
const size_t nalign=get_length_from_XStringSet_holder(&rf_aln);
auto rd_aln=hold_XStringSet(read_align);
if (nalign!=get_length_from_XStringSet_holder(&rd_aln)) {
throw std::runtime_error("lengths of alignment vectors should match up");
}
// Setting up output containers.
std::deque<int> collected_index, collected_pos, collected_rlen;
// Running through the pairwise aligners and reporting the observed length of all homopolymers.
for (size_t i=0; i<nalign; ++i) {
auto curref=get_elt_from_XStringSet_holder(&rf_aln, i);
const char* refstr=curref.ptr;
const size_t reflen=curref.length;
auto curread=get_elt_from_XStringSet_holder(&rd_aln, i);
const char* readstr=curread.ptr;
const size_t readlen=curread.length;
if (readlen!=reflen) {
throw std::runtime_error("read and reference alignment strings should have equal length");
}
if (reflen==0) {
continue;
}
// Iterating across the reference and counting the number of homopolymers in the read.
rle_walker ref_SQ(refstr, reflen);
while (!ref_SQ.is_finished()) {
ref_SQ.advance();
const size_t homolen=ref_SQ.get_length();
if (homolen==1) {
continue;
}
collected_index.push_back(i);
collected_pos.push_back(ref_SQ.get_start()+1);
const char curbase=ref_SQ.get_base();
const size_t farleft=ref_SQ.get_run_start_with_gaps(),
farright=ref_SQ.get_run_end_with_gaps();
const size_t left=ref_SQ.get_run_start(),
right=ref_SQ.get_run_end();
// Picking the longest homopolymer stretch in the read that overlaps with the current homopolymer.
rle_walker read_SQ(readstr + farleft, farright - farleft);
size_t maxlen=0;
while (!read_SQ.is_finished()) {
read_SQ.advance();
if (right > read_SQ.get_run_start() + farleft &&
left < read_SQ.get_run_end() + farleft) {
const size_t curlen=read_SQ.get_length();
if (curlen > maxlen && read_SQ.get_base()==curbase) {
maxlen=curlen;
}
}
}
collected_rlen.push_back(maxlen);
}
}
return Rcpp::List::create(Rcpp::IntegerVector(collected_index.begin(), collected_index.end()),
Rcpp::IntegerVector(collected_pos.begin(), collected_pos.end()),
Rcpp::IntegerVector(collected_rlen.begin(), collected_rlen.end()));
END_RCPP
}