-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtoken_vector.h
52 lines (38 loc) · 984 Bytes
/
token_vector.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#pragma once
#include <string>
#include <vector>
namespace ngram_tokenizer {
// Category tag carried by a Token (see Token::get_category) and returned by
// TokenVector::token_category.
//
// Declared as a named, unscoped enum rather than a C-style
// `typedef enum { ... } token_category_t;`. The type name and the enumerators
// are unchanged and still live in the enclosing scope, so existing uses such
// as `token_category_t c = DIGIT;` keep compiling.
//
// Enumerator values are implicit and sequential: DIGIT == 0, SPACE_OR_CONTROL
// == 1, ALPHABETIC == 2, PUNCTUATION == 3, OTHER == 4. Do not reorder them if
// any code depends on the numeric values.
enum token_category_t {
DIGIT,
SPACE_OR_CONTROL,
ALPHABETIC,
PUNCTUATION,
OTHER
};
// A single token: a string, a half-open integer range [iStart, iEnd) and a
// token_category_t category.
//
// Only declarations appear here; the constructor and accessors are defined
// outside this header.
class Token {
public:
// Builds a token from a string, two ints and a category.
// NOTE(review): the arguments presumably correspond, in order, to the members
// str, iStart, iEnd and category -- confirm against the out-of-line definition.
Token(std::string, int, int, token_category_t);
// Returns a const reference (no copy is made by the call itself).
// NOTE(review): presumably refers to the `str` member, in which case the
// reference is only valid while this Token is alive -- confirm.
const std::string &get_str() const;
// NOTE(review): presumably returns the `iStart` member -- confirm.
int get_iStart() const;
// NOTE(review): presumably returns the `iEnd` member -- confirm.
int get_iEnd() const;
// NOTE(review): presumably returns the `category` member -- confirm.
token_category_t get_category() const;
private:
std::string str;
// iStart and iEnd form a half-open range: iStart is part of the token, iEnd
// is one past it.
// NOTE(review): what the positions index into (presumably offsets into the
// text handed to TokenVector) and their unit (bytes vs. characters) are not
// visible here -- confirm.
int iStart; // Inclusive
int iEnd; // Exclusive
token_category_t category;
};
// Pairs a text buffer (a `const char *` plus an int) with a vector of Token
// objects.
//
// Only declarations appear here; all member functions are defined outside this
// header.
// NOTE(review): pText is a raw pointer, so this class presumably neither
// copies nor owns the text; the caller would then have to keep the buffer
// alive for as long as the TokenVector is used -- confirm.
class TokenVector {
public:
// Takes a character pointer and an int.
// NOTE(review): presumably stored as pText / nText (the text and its length)
// -- confirm against the out-of-line definition.
TokenVector(const char *, int);
// Non-const member returning bool.
// NOTE(review): presumably fills `tokens` from the text and reports success
// or failure -- confirm; the relationship to the `ok` member is not visible here.
bool tokenize();
// Read-only access to a vector of Token, returned by const reference (the
// call itself makes no copy).
// NOTE(review): presumably the `tokens` member, so the reference is only
// valid while this TokenVector is alive -- confirm.
const std::vector<Token> &get_tokens() const;
private:
// Static helpers that take a single char and need no object state.
// NOTE(review): the names suggest (1) classifying one char into a
// token_category_t and (2) deriving the length of a UTF-8 sequence from its
// first byte -- confirm against the definitions, including how negative
// (high-bit) char values are handled.
static token_category_t token_category(char);
static int utf8_char_count(char);
// NOTE(review): presumably the input text and its length; whether nText
// counts bytes or characters, and whether pText must be NUL-terminated, is
// not visible here -- confirm.
const char *pText;
int nText;
std::vector<Token> tokens;
// Status flag; where it is set and read is not visible in this header.
bool ok;
};
}