// SentencePiece.h
// Declares onmt::SentencePiece, a SubwordEncoder backed by a
// sentencepiece::SentencePieceProcessor.
#pragma once
#include <memory>
#include <string>
#include "onmt/opennmttokenizer_export.h"
#include "onmt/SubwordEncoder.h"
// Forward declaration only: this header refers to SentencePieceProcessor
// solely through a std::unique_ptr, so the complete type is not needed here
// and no SentencePiece library header has to be included.
namespace sentencepiece { class SentencePieceProcessor; }
namespace onmt
{
/// Exported SubwordEncoder implementation that owns a
/// sentencepiece::SentencePieceProcessor.
///
/// The processor is held through a const std::unique_ptr, which makes
/// instances of this class non-copyable and non-movable.
///
/// NOTE(review): std::vector is used in the signatures below, but this header
/// does not include <vector> itself; presumably it arrives through
/// "onmt/SubwordEncoder.h" -- confirm.
class OPENNMTTOKENIZER_EXPORT SentencePiece : public SubwordEncoder
{
public:
  /// @param model_path Path identifying the model to use (presumably a trained
  ///                   SentencePiece model file -- confirm in the implementation).
  ///
  /// NOTE(review): not marked `explicit`, so a std::string converts implicitly
  /// to SentencePiece -- confirm this is intended.
  SentencePiece(const std::string& model_path);

  /// Same as the single-argument constructor, additionally taking the
  /// `nbest_size` / `alpha` pair also accepted by enable_regularization().
  SentencePiece(const std::string& model_path, int nbest_size, float alpha);

  /// Declared here and defined out-of-line: SentencePieceProcessor is only
  /// forward-declared in this header, and destroying the std::unique_ptr
  /// member requires the complete type.
  ~SentencePiece();

  /// SubwordEncoder override; receives the tokenizer options by mutable
  /// reference, so the implementation may adjust them.
  void update_tokenization_options(Tokenizer::Options& options) const override;

  /// SubwordEncoder override.
  /// @param vocabulary List of vocabulary entries.
  /// @param options    Optional tokenizer options; defaults to nullptr.
  void set_vocabulary(const std::vector<std::string>& vocabulary,
                      const Tokenizer::Options* options = nullptr) override;

  /// SubwordEncoder override; counterpart of set_vocabulary().
  void reset_vocabulary() override;

  /// Sets the regularization parameters.
  /// @param nbest_size,alpha Presumably stored in _nbest_size / _alpha and
  ///        forwarded to SentencePiece's sampling API -- confirm in the
  ///        implementation.
  void enable_regularization(int nbest_size, float alpha);

  /// SubwordEncoder override: encodes a string into a list of string pieces.
  /// @param str      Input text.
  /// @param training Defaults to true.
  std::vector<std::string> encode(const std::string& str,
                                  bool training = true) const override;

  /// SubwordEncoder override: encodes a Token into a list of Tokens.
  /// @param token    Input token.
  /// @param training Defaults to true.
  std::vector<Token> encode_and_annotate(const Token& token,
                                         bool training = true) const override;

private:
  // Owned processor; the pointer itself is never reseated after construction.
  const std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
  // Regularization parameters (see enable_regularization()).
  int _nbest_size;
  float _alpha;
};
}