Add training flag to enable/disable subword regularization (#233)
* Add training flag to enable/disable subword regularization

* Disable training mode when annotating tokens
guillaumekln authored Mar 15, 2021
1 parent b42a4a5 commit 8176188
Showing 16 changed files with 103 additions and 61 deletions.
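In short, subword regularization (BPE dropout, SentencePiece sampling) now only applies when `training=True`, which remains the default. A minimal sketch of the new behavior from Python, based on the test changes in this commit (the BPE model path is hypothetical):

```python
import pyonmttok

# Hypothetical BPE model; bpe_dropout enables subword regularization.
tokenizer = pyonmttok.Tokenizer(
    "conservative",
    bpe_model_path="bpe.model",
    bpe_dropout=0.3,
)

# Default (training=True): merges are randomly dropped, so the
# segmentation may differ between calls.
tokens, _ = tokenizer.tokenize("improvement")

# training=False disables the dropout and yields the deterministic
# segmentation, which is what you want at inference time.
tokens, _ = tokenizer.tokenize("improvement", training=False)
```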
13 changes: 11 additions & 2 deletions bindings/python/README.md
@@ -81,17 +81,26 @@ See the [documentation](../../docs/options.md) for a description of each tokeniz

```python
# By default, tokenize returns the tokens and features.
tokenizer.tokenize(text: str) -> Tuple[List[str], List[List[str]]]
# When training=False, subword regularization such as BPE dropout is disabled.
tokenizer.tokenize(
text: str,
training: bool = True,
) -> Tuple[List[str], List[List[str]]]

# The as_token_objects flag can alternatively return Token objects (see below).
tokenizer.tokenize(text: str, as_token_objects=True) -> List[pyonmttok.Token]
tokenizer.tokenize(
text: str,
as_token_objects=True,
training: bool = True,
) -> List[pyonmttok.Token]

# Tokenize a file.
tokenizer.tokenize_file(
input_path: str,
output_path: str,
num_threads: int = 1,
verbose: bool = False,
training: bool = True,
)
```
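For example, the new flag can be passed when tokenizing a file for inference so that no subword regularization is applied (a sketch; the file paths are hypothetical):

```python
# Assumes `tokenizer` was built with a subword model and regularization
# options (e.g. bpe_dropout or sp_nbest_size/sp_alpha).
tokenizer.tokenize_file(
    input_path="test.txt",
    output_path="test.txt.tok",
    num_threads=4,
    training=False,  # disable BPE dropout / SentencePiece sampling
)
```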

19 changes: 12 additions & 7 deletions bindings/python/pyonmttok/Python.cc
@@ -133,14 +133,16 @@ class TokenizerWrapper
);
}

py::object tokenize(const std::string& text, const bool as_token_objects) const
py::object tokenize(const std::string& text,
const bool as_token_objects,
const bool training) const
{
if (as_token_objects)
{
std::vector<onmt::Token> tokens;
{
py::gil_scoped_release release;
_tokenizer->tokenize(text, tokens);
_tokenizer->tokenize(text, tokens, training);
}
return py::cast(tokens);
}
@@ -150,7 +152,7 @@

{
py::gil_scoped_release release;
_tokenizer->tokenize(text, words, features);
_tokenizer->tokenize(text, words, features, training);
}

return py::make_tuple(py::cast(words), features.empty() ? py::none() : py::cast(features));
@@ -242,7 +244,8 @@
void tokenize_file(const std::string& input_path,
const std::string& output_path,
int num_threads,
bool verbose)
bool verbose,
bool training)
{
std::ifstream in(input_path);
if (!in)
@@ -251,7 +254,7 @@
if (!out)
throw std::invalid_argument("Failed to open output file " + output_path);
py::gil_scoped_release release;
_tokenizer->tokenize_stream(in, out, num_threads, verbose);
_tokenizer->tokenize_stream(in, out, num_threads, verbose, training);
}

void detokenize_file(const std::string& input_path,
@@ -594,7 +597,8 @@ PYBIND11_MODULE(_ext, m)
.def_property_readonly("options", &TokenizerWrapper::get_options)
.def("tokenize", &TokenizerWrapper::tokenize,
py::arg("text"),
py::arg("as_token_objects")=false)
py::arg("as_token_objects")=false,
py::arg("training")=true)
.def("serialize_tokens", &TokenizerWrapper::serialize_tokens,
py::arg("tokens"))
.def("deserialize_tokens", &TokenizerWrapper::deserialize_tokens,
@@ -603,7 +607,8 @@
py::arg("input_path"),
py::arg("output_path"),
py::arg("num_threads")=1,
py::arg("verbose")=false)
py::arg("verbose")=false,
py::arg("training")=true)
.def("detokenize", &TokenizerWrapper::detokenize,
py::arg("tokens"), py::arg("features")=py::none())
.def("detokenize_with_ranges", &TokenizerWrapper::detokenize_with_ranges,
4 changes: 3 additions & 1 deletion bindings/python/test/test.py
@@ -178,7 +178,7 @@ def test_detok_with_ranges():
assert ranges[0] == (0, 1)
assert ranges[1] == (0, 1)

def test_random_seed():
def test_subword_regularization():
pyonmttok.set_random_seed(42)

tokenizer = pyonmttok.Tokenizer(
@@ -187,12 +187,14 @@ def test_random_seed():
sp_nbest_size=10,
sp_alpha=0.1)
assert tokenizer.tokenize("appealing")[0] == ["▁app", "e", "al", "ing"]
assert tokenizer.tokenize("appealing", training=False)[0] == ["▁appealing"]

tokenizer = pyonmttok.Tokenizer(
"conservative",
bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "testcode.v0.1"),
bpe_dropout=0.3)
assert tokenizer.tokenize("improvement")[0] == ["i", "m", "pr", "ove", "m", "e", "n", "t"]
assert tokenizer.tokenize("improvement", training=False)[0] == ["impr", "ovemen", "t"]

def test_bpe_case_insensitive_issue_147():
tokenizer = pyonmttok.Tokenizer(
6 changes: 3 additions & 3 deletions include/onmt/BPE.h
@@ -18,8 +18,8 @@ namespace onmt
BPE(const std::string& model_path, const float dropout = 0);
BPE(const std::string& model_path, const std::string& joiner, const float dropout = 0);

std::vector<std::string> encode(const std::string& str) const override;
std::vector<Token> encode_and_annotate(const Token& token) const override;
std::vector<std::string> encode(const std::string& str, bool training = true) const override;
std::vector<Token> encode_and_annotate(const Token& token, bool training = true) const override;

void set_vocabulary(const std::vector<std::string>& vocabulary,
const Tokenizer::Options* options = nullptr) override;
@@ -57,7 +57,7 @@
void load_model(const std::string& model_path);

int get_score(const std::string& gram1, const std::string& gram2) const;
void apply_merges(std::vector<std::string>& chars) const;
void apply_merges(std::vector<std::string>& chars, bool training) const;

bool in_vocabulary(const std::string& token) const;
bool in_vocabulary(const onmt::Token& token) const;
11 changes: 8 additions & 3 deletions include/onmt/ITokenizer.h
@@ -23,12 +23,16 @@ namespace onmt

virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const = 0;
std::vector<std::vector<std::string> >& features,
bool training = true) const = 0;
virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>& alphabets) const;
virtual void tokenize(const std::string& text, std::vector<std::string>& words) const;
std::unordered_map<std::string, size_t>& alphabets,
bool training = true) const;
virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
bool training = true) const;

virtual std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features) const = 0;
@@ -43,6 +47,7 @@
std::ostream& os,
size_t num_threads = 1,
bool verbose = false,
bool training = true,
size_t buffer_size = 1000) const;

virtual void detokenize_stream(std::istream& is, std::ostream& os) const;
4 changes: 2 additions & 2 deletions include/onmt/SentencePiece.h
@@ -27,8 +27,8 @@ namespace onmt
void reset_vocabulary() override;
void enable_regularization(int nbest_size, float alpha);

std::vector<std::string> encode(const std::string& str) const override;
std::vector<Token> encode_and_annotate(const Token& token) const override;
std::vector<std::string> encode(const std::string& str, bool training = true) const override;
std::vector<Token> encode_and_annotate(const Token& token, bool training = true) const override;

private:
const std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
3 changes: 2 additions & 1 deletion include/onmt/SpaceTokenizer.h
@@ -15,7 +15,8 @@ namespace onmt

void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const override;
std::vector<std::vector<std::string> >& features,
bool) const override;

std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features) const override;
9 changes: 6 additions & 3 deletions include/onmt/SubwordEncoder.h
@@ -24,9 +24,12 @@ namespace onmt
const Tokenizer::Options* tokenization_options = nullptr);
virtual void reset_vocabulary();

virtual std::vector<std::string> encode(const std::string& str) const = 0;
virtual std::vector<Token> encode_and_annotate(const Token& token) const = 0;
virtual std::vector<Token> encode_and_annotate(const std::vector<Token>& tokens) const;
virtual std::vector<std::string> encode(const std::string& str,
bool training = true) const = 0;
virtual std::vector<Token> encode_and_annotate(const Token& token,
bool training = true) const = 0;
virtual std::vector<Token> encode_and_annotate(const std::vector<Token>& tokens,
bool training = true) const;

static void propagate_token_properties(const Token& token, std::vector<Token>& tokens);
};
18 changes: 11 additions & 7 deletions include/onmt/Tokenizer.h
@@ -77,15 +77,17 @@ namespace onmt

void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const override;

std::vector<std::vector<std::string> >& features,
bool training = true) const override;
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>& alphabets) const override;

std::unordered_map<std::string, size_t>& alphabets,
bool training = true) const override;
void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens) const;
std::vector<Token>& annotated_tokens,
bool training = true) const;

Token annotate_token(const std::string& word) const;
void annotate_tokens(const std::vector<std::string>& words,
const std::vector<std::vector<std::string>>& features,
@@ -127,11 +129,13 @@

void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens,
std::unordered_map<std::string, size_t>* alphabets) const;
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>* alphabets) const;
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
std::string detokenize(const std::vector<Token>& tokens,
Ranges* ranges,
bool merge_ranges = false,
12 changes: 6 additions & 6 deletions src/BPE.cc
@@ -156,7 +156,7 @@ namespace onmt
return pieces;
}

std::vector<std::string> BPE::encode(const std::string& str) const
std::vector<std::string> BPE::encode(const std::string& str, bool training) const
{
const auto chars_info = unicode::get_characters_info(str);
std::vector<std::string> chars = get_initial_pieces(chars_info, _case_insensitive);
@@ -182,7 +182,7 @@
chars.push_back(_end_of_word);
}

apply_merges(chars);
apply_merges(chars, training);

if (_prefix && starts_with(chars.front(), _begin_of_word))
{
@@ -233,9 +233,9 @@
return chars;
}

std::vector<Token> BPE::encode_and_annotate(const Token& token) const
std::vector<Token> BPE::encode_and_annotate(const Token& token, bool training) const
{
std::vector<std::string> encoded = encode(token.surface);
std::vector<std::string> encoded = encode(token.surface, training);
std::vector<Token> tokens;
tokens.reserve(encoded.size());

@@ -273,7 +273,7 @@
return std::numeric_limits<int>::max();
}

void BPE::apply_merges(std::vector<std::string>& chars) const
void BPE::apply_merges(std::vector<std::string>& chars, bool training) const
{
// Compute score for all pairs.
std::vector<int> scores;
@@ -289,7 +289,7 @@

for (size_t i = 0; i < scores.size(); ++i)
{
if (_dropout != 0)
if (training && _dropout != 0)
{
std::uniform_real_distribution<float> dist;
const float sample = dist(get_random_generator());
16 changes: 10 additions & 6 deletions src/ITokenizer.cc
@@ -140,15 +140,18 @@ namespace onmt
void ITokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>&) const
std::unordered_map<std::string, size_t>&,
bool training) const
{
tokenize(text, words, features);
tokenize(text, words, features, training);
}

void ITokenizer::tokenize(const std::string& text, std::vector<std::string>& words) const
void ITokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
bool training) const
{
std::vector<std::vector<std::string> > features;
tokenize(text, words, features);
tokenize(text, words, features, training);
}

std::string ITokenizer::detokenize(const std::vector<std::string>& words) const
@@ -175,14 +178,15 @@ namespace onmt
std::ostream& out,
size_t num_threads,
bool verbose,
bool training,
size_t buffer_size) const
{
using Result = std::pair<std::vector<std::string>, std::vector<std::vector<std::string>>>;
auto function = [this](const std::string& text)
auto function = [this, training](const std::string& text)
{
std::vector<std::string> words;
std::vector<std::vector<std::string>> features;
this->tokenize(text, words, features);
this->tokenize(text, words, features, training);
return Result(std::move(words), std::move(features));
};
auto writer = [](std::ostream& os, const Result& result)
8 changes: 4 additions & 4 deletions src/SentencePiece.cc
@@ -70,21 +70,21 @@ namespace onmt
_alpha = alpha;
}

std::vector<std::string> SentencePiece::encode(const std::string& str) const
std::vector<std::string> SentencePiece::encode(const std::string& str, bool training) const
{
std::vector<std::string> pieces;

if (_nbest_size != 0)
if (training && _nbest_size != 0)
_processor->SampleEncode(str, _nbest_size, _alpha, &pieces);
else
_processor->Encode(str, &pieces);

return pieces;
}

std::vector<Token> SentencePiece::encode_and_annotate(const Token& token) const
std::vector<Token> SentencePiece::encode_and_annotate(const Token& token, bool training) const
{
std::vector<std::string> pieces = encode(token.surface);
std::vector<std::string> pieces = encode(token.surface, training);

// SentencePiece sometimes returns no pieces for a non empty input. In this case
// we simply return the original token.
3 changes: 2 additions & 1 deletion src/SpaceTokenizer.cc
@@ -15,7 +15,8 @@ namespace onmt

void SpaceTokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const
std::vector<std::vector<std::string> >& features,
bool) const
{
words.reserve(text.length());

5 changes: 3 additions & 2 deletions src/SubwordEncoder.cc
@@ -56,7 +56,8 @@ namespace onmt
{
}

std::vector<Token> SubwordEncoder::encode_and_annotate(const std::vector<Token>& tokens) const
std::vector<Token> SubwordEncoder::encode_and_annotate(const std::vector<Token>& tokens,
bool training) const
{
std::vector<Token> segments;
segments.reserve(tokens.size() * 2);
@@ -68,7 +69,7 @@
continue;
}

std::vector<Token> sub_segments = encode_and_annotate(token);
std::vector<Token> sub_segments = encode_and_annotate(token, training);
segments.insert(segments.end(),
std::make_move_iterator(sub_segments.begin()),
std::make_move_iterator(sub_segments.end()));