-
Notifications
You must be signed in to change notification settings - Fork 0
/
ntHashIterator.hpp
139 lines (114 loc) · 2.99 KB
/
ntHashIterator.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#ifndef NTHASH__ITERATOR_H
#define NTHASH__ITERATOR_H 1
#include <string>
#include <limits>
#include "nthash.hpp"
/**
* Iterate over hash values for k-mers in a
* given DNA sequence.
*
* This implementation uses ntHash
* function to efficiently calculate
* hash values for successive k-mers.
*/
class ntHashIterator
{
public:
    /**
     * Default constructor. Creates an iterator pointing to
     * the end of the iterator range.
     *
     * Every member is given a defined value so that copying, init()
     * and next() are well-defined on an end iterator.
     */
    ntHashIterator():
        m_seq(),
        m_h(0),
        m_k(0),
        m_hVec(NULL),
        m_pos(std::numeric_limits<std::size_t>::max()),
        m_fhVal(0),
        m_rhVal(0)
    {}

    /**
     * Constructor. Stores a copy of the sequence and positions the
     * iterator on the first k-mer accepted by NTMC64 (or at the end
     * of the range when there is none).
     * @param seq DNA sequence to be hashed (copied into the iterator)
     * @param h number of hashes
     * @param k k-mer size
     */
    ntHashIterator(const std::string& seq, unsigned h, unsigned k):
        m_seq(seq),
        m_h(h),
        m_k(k),
        // Value-initialise the buffer: it stays untouched when no k-mer
        // is accepted, and a copy of the iterator must not read
        // indeterminate values.
        m_hVec(new uint64_t[h]()),
        m_pos(0),
        m_fhVal(0),
        m_rhVal(0)
    {
        init();
    }

    /**
     * Copy constructor. Performs a deep copy of the hash buffer so that
     * the copy and the original never release the same allocation.
     * @param it iterator to copy from
     */
    ntHashIterator(const ntHashIterator& it):
        m_seq(it.m_seq),
        m_h(it.m_h),
        m_k(it.m_k),
        m_hVec(cloneHashes(it.m_hVec, it.m_h)),
        m_pos(it.m_pos),
        m_fhVal(it.m_fhVal),
        m_rhVal(it.m_rhVal)
    {}

    /**
     * Copy assignment. Deep-copies the hash buffer of @p it.
     * Everything that can throw is done before this object is modified.
     * @param it iterator to copy from
     * @return reference to this iterator
     */
    ntHashIterator& operator=(const ntHashIterator& it)
    {
        if (this != &it) {
            std::string seqCopy(it.m_seq);
            uint64_t* hashCopy = cloneHashes(it.m_hVec, it.m_h);

            m_seq.swap(seqCopy);
            delete [] m_hVec;
            m_hVec = hashCopy;
            m_h = it.m_h;
            m_k = it.m_k;
            m_pos = it.m_pos;
            m_fhVal = it.m_fhVal;
            m_rhVal = it.m_rhVal;
        }
        return *this;
    }

    /**
     * Initialize internal state of iterator.
     *
     * Starting at the current position, calls NTMC64 on successive
     * candidate k-mers until it reports success, skipping ahead by
     * locN+1 after each failure. If no k-mer start position remains,
     * the iterator is moved to the end of the range.
     */
    void init()
    {
        if (m_k > m_seq.length()) {
            m_pos = std::numeric_limits<std::size_t>::max();
            return;
        }
        // One past the last valid k-mer start position.
        const std::size_t endPos = m_seq.length() - m_k + 1;
        unsigned locN = 0;
        while (m_pos < endPos
                && !NTMC64(m_seq.data() + m_pos, m_k, m_h, m_fhVal, m_rhVal, locN, m_hVec))
            m_pos += locN + 1;
        if (m_pos >= endPos)
            m_pos = std::numeric_limits<std::size_t>::max();
    }

    /**
     * Advance iterator right to the next valid k-mer.
     * Calling this on an iterator that is already at the end of the
     * range has no effect.
     */
    void next()
    {
        // Incrementing the end sentinel would wrap the position to 0.
        if (m_pos == std::numeric_limits<std::size_t>::max())
            return;
        ++m_pos;
        if (m_pos >= m_seq.length() - m_k + 1) {
            m_pos = std::numeric_limits<std::size_t>::max();
            return;
        }
        if (seedTab[static_cast<unsigned char>(m_seq.at(m_pos + m_k - 1))] == seedN) {
            // The character entering the window maps to seedN: skip
            // past it and recompute the hashes from scratch.
            m_pos += m_k;
            init();
        }
        else {
            // Roll the hashes: pass the character leaving the window
            // and the character entering it.
            NTMC64(m_seq.at(m_pos - 1), m_seq.at(m_pos - 1 + m_k), m_k, m_h, m_fhVal, m_rhVal, m_hVec);
        }
    }

    /** get position of current k-mer (maximum size_t value at the end of the range) */
    size_t pos() const
    {
        return m_pos;
    }

    /** get pointer to hash values for current k-mer */
    const uint64_t* operator*() const
    {
        return m_hVec;
    }

    /** test equality with another iterator (only positions are compared) */
    bool operator==(const ntHashIterator& it) const
    {
        return m_pos == it.m_pos;
    }

    /** test inequality with another iterator */
    bool operator!=(const ntHashIterator& it) const
    {
        return !(*this == it);
    }

    /** pre-increment operator */
    ntHashIterator& operator++()
    {
        next();
        return *this;
    }

    /** iterator pointing to one past last element */
    static const ntHashIterator end()
    {
        return ntHashIterator();
    }

    /** destructor; releases the hash buffer */
    ~ntHashIterator()
    {
        delete [] m_hVec;   // deleting a null pointer is a no-op
    }

private:
    /**
     * Allocate a new buffer of @p h hash values copied from @p src.
     * @return the new buffer, or NULL when @p src is NULL
     */
    static uint64_t* cloneHashes(const uint64_t* src, unsigned h)
    {
        if (src == NULL)
            return NULL;
        uint64_t* dst = new uint64_t[h];
        for (unsigned i = 0; i < h; ++i)
            dst[i] = src[i];
        return dst;
    }

    /** DNA sequence (owned copy) */
    std::string m_seq;
    /** number of hashes */
    unsigned m_h;
    /** k-mer size */
    unsigned m_k;
    /** hash values (owned array of m_h entries, NULL for the end iterator) */
    uint64_t *m_hVec;
    /** position of current k-mer */
    size_t m_pos;
    /** forward-strand k-mer hash value */
    uint64_t m_fhVal;
    /** reverse-complement k-mer hash value */
    uint64_t m_rhVal;
};
#endif