-
Notifications
You must be signed in to change notification settings - Fork 73
/
Copy pathSubwordEncoder.h
37 lines (28 loc) · 1.34 KB
/
SubwordEncoder.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#pragma once
#include <string>
#include <vector>
#include "onmt/opennmttokenizer_export.h"
#include "onmt/Tokenizer.h"
namespace onmt
{
// Abstract interface for subword encoders.
//
// Implementations must provide encode() and the single-token overload of
// encode_and_annotate(), which are the only pure virtual members. Every other
// virtual member is declared non-pure, so a base definition exists outside
// this header (not visible here).
//
// NOTE: several virtual members carry default arguments. C++ picks default
// arguments from the static type at the call site, so overrides should
// repeat the same defaults to avoid surprising callers.
class OPENNMTTOKENIZER_EXPORT SubwordEncoder
{
public:
  // Virtual so an encoder can be destroyed through a SubwordEncoder pointer.
  virtual ~SubwordEncoder() = default;

  // Maybe update the tokenization options for this subword encoder.
  // `options` is received by non-const reference, so it may be edited in place.
  virtual void update_tokenization_options(Tokenizer::Options& options) const;

  // --- Vocabulary management -------------------------------------------------
  // The exact semantics are defined out of line; the notes below describe the
  // signatures only.

  // Loads a vocabulary from the file at `path`.
  // `frequency_threshold`: presumably a minimum frequency for entries to be
  // kept -- TODO confirm against the implementation.
  // `tokenization_options`: optional; may be null (the default).
  virtual void load_vocabulary(
    const std::string& path,
    int frequency_threshold,
    const Tokenizer::Options* tokenization_options = nullptr);

  // Sets the vocabulary from an in-memory list of entries.
  // `tokenization_options`: optional; may be null (the default).
  virtual void set_vocabulary(
    const std::vector<std::string>& vocabulary,
    const Tokenizer::Options* tokenization_options = nullptr);

  // Resets the vocabulary (presumably discarding one installed by
  // load_vocabulary()/set_vocabulary() -- confirm against the implementation).
  virtual void reset_vocabulary();

  // --- Encoding --------------------------------------------------------------

  // Encodes the string `str` and returns the resulting pieces as strings.
  // `training` defaults to true; its effect is implementation-defined.
  virtual std::vector<std::string> encode(
    const std::string& str, bool training = true) const = 0;

  // Encodes a single token and returns the resulting Token objects.
  virtual std::vector<Token> encode_and_annotate(
    const Token& token, bool training = true) const = 0;

  // Sequence overload: takes a list of tokens and returns the resulting
  // Token objects. Non-pure, so a base implementation is provided out of line.
  virtual std::vector<Token> encode_and_annotate(
    const std::vector<Token>& tokens, bool training = true) const;

  // Static helper taking a source `token` and a mutable list of `tokens`;
  // by its name it copies properties of `token` onto `tokens` (definition
  // not visible here -- confirm which properties and how they are assigned).
  static void propagate_token_properties(const Token& token, std::vector<Token>& tokens);
};
}