-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathToken.h
83 lines (68 loc) · 1.5 KB
/
Token.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#pragma once
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

#include "onmt/opennmttokenizer_export.h"
namespace onmt
{
// Casing category stored on a Token (Token::casing defaults to Casing::None).
// The numeric values are the same 0-based values the compiler would assign
// implicitly; they are spelled out so that reordering is a visible change.
// NOTE(review): the meaning of each category is inferred from its name only;
// confirm against the code that assigns Token::casing.
enum class Casing
{
  None = 0,
  Lowercase = 1,
  Uppercase = 2,
  Mixed = 3,
  Capitalized = 4
};
// Kind of a Token (Token::type defaults to TokenType::Word).
// The numeric values are the same 0-based values the compiler would assign
// implicitly; they are spelled out so that reordering is a visible change.
// NOTE(review): the names suggest a whole word versus the first / following
// pieces of a word split into subwords; confirm against the code that
// assigns Token::type.
enum class TokenType
{
  Word = 0,
  LeadingSubword = 1,
  TrailingSubword = 2
};
class OPENNMTTOKENIZER_EXPORT Token
{
public:
std::string surface;
TokenType type = TokenType::Word;
Casing casing = Casing::None;
bool join_left = false;
bool join_right = false;
bool spacer = false;
bool preserve = false;
std::vector<std::string> features;
Token() = default;
Token(std::string str)
: surface(std::move(str))
{
}
template <typename... Args>
void append(Args&&... args)
{
surface.append(std::forward<Args>(args)...);
}
bool empty() const
{
return surface.empty();
}
bool is_placeholder() const;
size_t unicode_length() const;
void append_feature(std::string feature)
{
features.emplace_back(std::move(feature));
}
bool has_features() const
{
return !features.empty();
}
bool operator==(const Token& other) const
{
return (surface == other.surface
&& type == other.type
&& casing == other.casing
&& join_left == other.join_left
&& join_right == other.join_right
&& spacer == other.spacer
&& preserve == other.preserve
&& features == other.features);
}
};
}