Add training flag to enable/disable subword regularization (#233)
* Add training flag to enable/disable subword regularization

* Disable training mode when annotating tokens
guillaumekln authored Mar 15, 2021
1 parent b42a4a5 commit 8176188
Showing 16 changed files with 103 additions and 61 deletions.
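In short, subword regularization (BPE dropout, SentencePiece sampling) now only applies when `training=True`, which remains the default. A minimal sketch of the new behavior from Python, based on the test changes in this commit (the BPE model path is hypothetical):

```python
import pyonmttok

# Hypothetical BPE model; bpe_dropout enables subword regularization.
tokenizer = pyonmttok.Tokenizer(
    "conservative",
    bpe_model_path="bpe.model",
    bpe_dropout=0.3,
)

# Default (training=True): merges are randomly dropped, so the
# segmentation may differ between calls.
tokens, _ = tokenizer.tokenize("improvement")

# training=False disables the dropout and yields the deterministic
# segmentation, which is what you want at inference time.
tokens, _ = tokenizer.tokenize("improvement", training=False)
```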
13 changes: 11 additions & 2 deletions bindings/python/README.md
@@ -81,17 +81,26 @@ See the [documentation](../../docs/options.md) for a description of each tokeniz

```python
# By default, tokenize returns the tokens and features.
tokenizer.tokenize(text: str) -> Tuple[List[str], List[List[str]]]
# When training=False, subword regularization such as BPE dropout is disabled.
tokenizer.tokenize(
text: str,
training: bool = True,
) -> Tuple[List[str], List[List[str]]]

# The as_token_objects flag can alternatively return Token objects (see below).
tokenizer.tokenize(text: str, as_token_objects=True) -> List[pyonmttok.Token]
tokenizer.tokenize(
text: str,
as_token_objects=True,
training: bool = True,
) -> List[pyonmttok.Token]

# Tokenize a file.
tokenizer.tokenize_file(
input_path: str,
output_path: str,
num_threads: int = 1,
verbose: bool = False,
training: bool = True,
)
```
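For example, the new flag can be passed when tokenizing a file for inference so that no subword regularization is applied (a sketch; the file paths are hypothetical):

```python
# Assumes `tokenizer` was built with a subword model and regularization
# options (e.g. bpe_dropout or sp_nbest_size/sp_alpha).
tokenizer.tokenize_file(
    input_path="test.txt",
    output_path="test.txt.tok",
    num_threads=4,
    training=False,  # disable BPE dropout / SentencePiece sampling
)
```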

19 changes: 12 additions & 7 deletions bindings/python/pyonmttok/Python.cc
@@ -133,14 +133,16 @@ class TokenizerWrapper
);
}

py::object tokenize(const std::string& text, const bool as_token_objects) const
py::object tokenize(const std::string& text,
const bool as_token_objects,
const bool training) const
{
if (as_token_objects)
{
std::vector<onmt::Token> tokens;
{
py::gil_scoped_release release;
_tokenizer->tokenize(text, tokens);
_tokenizer->tokenize(text, tokens, training);
}
return py::cast(tokens);
}
@@ -150,7 +152,7 @@

{
py::gil_scoped_release release;
_tokenizer->tokenize(text, words, features);
_tokenizer->tokenize(text, words, features, training);
}

return py::make_tuple(py::cast(words), features.empty() ? py::none() : py::cast(features));
@@ -242,7 +244,8 @@
void tokenize_file(const std::string& input_path,
const std::string& output_path,
int num_threads,
bool verbose)
bool verbose,
bool training)
{
std::ifstream in(input_path);
if (!in)
@@ -251,7 +254,7 @@
if (!out)
throw std::invalid_argument("Failed to open output file " + output_path);
py::gil_scoped_release release;
_tokenizer->tokenize_stream(in, out, num_threads, verbose);
_tokenizer->tokenize_stream(in, out, num_threads, verbose, training);
}

void detokenize_file(const std::string& input_path,
@@ -594,7 +597,8 @@ PYBIND11_MODULE(_ext, m)
.def_property_readonly("options", &TokenizerWrapper::get_options)
.def("tokenize", &TokenizerWrapper::tokenize,
py::arg("text"),
py::arg("as_token_objects")=false)
py::arg("as_token_objects")=false,
py::arg("training")=true)
.def("serialize_tokens", &TokenizerWrapper::serialize_tokens,
py::arg("tokens"))
.def("deserialize_tokens", &TokenizerWrapper::deserialize_tokens,
@@ -603,7 +607,8 @@
py::arg("input_path"),
py::arg("output_path"),
py::arg("num_threads")=1,
py::arg("verbose")=false)
py::arg("verbose")=false,
py::arg("training")=true)
.def("detokenize", &TokenizerWrapper::detokenize,
py::arg("tokens"), py::arg("features")=py::none())
.def("detokenize_with_ranges", &TokenizerWrapper::detokenize_with_ranges,
4 changes: 3 additions & 1 deletion bindings/python/test/test.py
@@ -178,7 +178,7 @@ def test_detok_with_ranges():
assert ranges[0] == (0, 1)
assert ranges[1] == (0, 1)

def test_random_seed():
def test_subword_regularization():
pyonmttok.set_random_seed(42)

tokenizer = pyonmttok.Tokenizer(
@@ -187,12 +187,14 @@ def test_random_seed():
sp_nbest_size=10,
sp_alpha=0.1)
assert tokenizer.tokenize("appealing")[0] == ["▁app", "e", "al", "ing"]
assert tokenizer.tokenize("appealing", training=False)[0] == ["▁appealing"]

tokenizer = pyonmttok.Tokenizer(
"conservative",
bpe_model_path=os.path.join(_DATA_DIR, "bpe-models", "testcode.v0.1"),
bpe_dropout=0.3)
assert tokenizer.tokenize("improvement")[0] == ["i", "m", "pr", "ove", "m", "e", "n", "t"]
assert tokenizer.tokenize("improvement", training=False)[0] == ["impr", "ovemen", "t"]

def test_bpe_case_insensitive_issue_147():
tokenizer = pyonmttok.Tokenizer(
6 changes: 3 additions & 3 deletions include/onmt/BPE.h
@@ -18,8 +18,8 @@ namespace onmt
BPE(const std::string& model_path, const float dropout = 0);
BPE(const std::string& model_path, const std::string& joiner, const float dropout = 0);

std::vector<std::string> encode(const std::string& str) const override;
std::vector<Token> encode_and_annotate(const Token& token) const override;
std::vector<std::string> encode(const std::string& str, bool training = true) const override;
std::vector<Token> encode_and_annotate(const Token& token, bool training = true) const override;

void set_vocabulary(const std::vector<std::string>& vocabulary,
const Tokenizer::Options* options = nullptr) override;
@@ -57,7 +57,7 @@
void load_model(const std::string& model_path);

int get_score(const std::string& gram1, const std::string& gram2) const;
void apply_merges(std::vector<std::string>& chars) const;
void apply_merges(std::vector<std::string>& chars, bool training) const;

bool in_vocabulary(const std::string& token) const;
bool in_vocabulary(const onmt::Token& token) const;
11 changes: 8 additions & 3 deletions include/onmt/ITokenizer.h
@@ -23,12 +23,16 @@ namespace onmt

virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const = 0;
std::vector<std::vector<std::string> >& features,
bool training = true) const = 0;
virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>& alphabets) const;
virtual void tokenize(const std::string& text, std::vector<std::string>& words) const;
std::unordered_map<std::string, size_t>& alphabets,
bool training = true) const;
virtual void tokenize(const std::string& text,
std::vector<std::string>& words,
bool training = true) const;

virtual std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features) const = 0;
@@ -43,6 +47,7 @@
std::ostream& os,
size_t num_threads = 1,
bool verbose = false,
bool training = true,
size_t buffer_size = 1000) const;

virtual void detokenize_stream(std::istream& is, std::ostream& os) const;
4 changes: 2 additions & 2 deletions include/onmt/SentencePiece.h
@@ -27,8 +27,8 @@ namespace onmt
void reset_vocabulary() override;
void enable_regularization(int nbest_size, float alpha);

std::vector<std::string> encode(const std::string& str) const override;
std::vector<Token> encode_and_annotate(const Token& token) const override;
std::vector<std::string> encode(const std::string& str, bool training = true) const override;
std::vector<Token> encode_and_annotate(const Token& token, bool training = true) const override;

private:
const std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
3 changes: 2 additions & 1 deletion include/onmt/SpaceTokenizer.h
@@ -15,7 +15,8 @@ namespace onmt

void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const override;
std::vector<std::vector<std::string> >& features,
bool) const override;

std::string detokenize(const std::vector<std::string>& words,
const std::vector<std::vector<std::string> >& features) const override;
9 changes: 6 additions & 3 deletions include/onmt/SubwordEncoder.h
@@ -24,9 +24,12 @@ namespace onmt
const Tokenizer::Options* tokenization_options = nullptr);
virtual void reset_vocabulary();

virtual std::vector<std::string> encode(const std::string& str) const = 0;
virtual std::vector<Token> encode_and_annotate(const Token& token) const = 0;
virtual std::vector<Token> encode_and_annotate(const std::vector<Token>& tokens) const;
virtual std::vector<std::string> encode(const std::string& str,
bool training = true) const = 0;
virtual std::vector<Token> encode_and_annotate(const Token& token,
bool training = true) const = 0;
virtual std::vector<Token> encode_and_annotate(const std::vector<Token>& tokens,
bool training = true) const;

static void propagate_token_properties(const Token& token, std::vector<Token>& tokens);
};
18 changes: 11 additions & 7 deletions include/onmt/Tokenizer.h
@@ -77,15 +77,17 @@ namespace onmt

void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const override;

std::vector<std::vector<std::string> >& features,
bool training = true) const override;
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>& alphabets) const override;

std::unordered_map<std::string, size_t>& alphabets,
bool training = true) const override;
void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens) const;
std::vector<Token>& annotated_tokens,
bool training = true) const;

Token annotate_token(const std::string& word) const;
void annotate_tokens(const std::vector<std::string>& words,
const std::vector<std::vector<std::string>>& features,
@@ -127,11 +129,13 @@

void tokenize(const std::string& text,
std::vector<Token>& annotated_tokens,
std::unordered_map<std::string, size_t>* alphabets) const;
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
void tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>* alphabets) const;
std::unordered_map<std::string, size_t>* alphabets,
bool training) const;
std::string detokenize(const std::vector<Token>& tokens,
Ranges* ranges,
bool merge_ranges = false,
12 changes: 6 additions & 6 deletions src/BPE.cc
@@ -156,7 +156,7 @@ namespace onmt
return pieces;
}

std::vector<std::string> BPE::encode(const std::string& str) const
std::vector<std::string> BPE::encode(const std::string& str, bool training) const
{
const auto chars_info = unicode::get_characters_info(str);
std::vector<std::string> chars = get_initial_pieces(chars_info, _case_insensitive);
@@ -182,7 +182,7 @@
chars.push_back(_end_of_word);
}

apply_merges(chars);
apply_merges(chars, training);

if (_prefix && starts_with(chars.front(), _begin_of_word))
{
@@ -233,9 +233,9 @@
return chars;
}

std::vector<Token> BPE::encode_and_annotate(const Token& token) const
std::vector<Token> BPE::encode_and_annotate(const Token& token, bool training) const
{
std::vector<std::string> encoded = encode(token.surface);
std::vector<std::string> encoded = encode(token.surface, training);
std::vector<Token> tokens;
tokens.reserve(encoded.size());

@@ -273,7 +273,7 @@
return std::numeric_limits<int>::max();
}

void BPE::apply_merges(std::vector<std::string>& chars) const
void BPE::apply_merges(std::vector<std::string>& chars, bool training) const
{
// Compute score for all pairs.
std::vector<int> scores;
@@ -289,7 +289,7 @@

for (size_t i = 0; i < scores.size(); ++i)
{
if (_dropout != 0)
if (training && _dropout != 0)
{
std::uniform_real_distribution<float> dist;
const float sample = dist(get_random_generator());
16 changes: 10 additions & 6 deletions src/ITokenizer.cc
@@ -140,15 +140,18 @@ namespace onmt
void ITokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features,
std::unordered_map<std::string, size_t>&) const
std::unordered_map<std::string, size_t>&,
bool training) const
{
tokenize(text, words, features);
tokenize(text, words, features, training);
}

void ITokenizer::tokenize(const std::string& text, std::vector<std::string>& words) const
void ITokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
bool training) const
{
std::vector<std::vector<std::string> > features;
tokenize(text, words, features);
tokenize(text, words, features, training);
}

std::string ITokenizer::detokenize(const std::vector<std::string>& words) const
@@ -175,14 +178,15 @@ namespace onmt
std::ostream& out,
size_t num_threads,
bool verbose,
bool training,
size_t buffer_size) const
{
using Result = std::pair<std::vector<std::string>, std::vector<std::vector<std::string>>>;
auto function = [this](const std::string& text)
auto function = [this, training](const std::string& text)
{
std::vector<std::string> words;
std::vector<std::vector<std::string>> features;
this->tokenize(text, words, features);
this->tokenize(text, words, features, training);
return Result(std::move(words), std::move(features));
};
auto writer = [](std::ostream& os, const Result& result)
8 changes: 4 additions & 4 deletions src/SentencePiece.cc
@@ -70,21 +70,21 @@ namespace onmt
_alpha = alpha;
}

std::vector<std::string> SentencePiece::encode(const std::string& str) const
std::vector<std::string> SentencePiece::encode(const std::string& str, bool training) const
{
std::vector<std::string> pieces;

if (_nbest_size != 0)
if (training && _nbest_size != 0)
_processor->SampleEncode(str, _nbest_size, _alpha, &pieces);
else
_processor->Encode(str, &pieces);

return pieces;
}

std::vector<Token> SentencePiece::encode_and_annotate(const Token& token) const
std::vector<Token> SentencePiece::encode_and_annotate(const Token& token, bool training) const
{
std::vector<std::string> pieces = encode(token.surface);
std::vector<std::string> pieces = encode(token.surface, training);

// SentencePiece sometimes returns no pieces for a non empty input. In this case
// we simply return the original token.
3 changes: 2 additions & 1 deletion src/SpaceTokenizer.cc
@@ -15,7 +15,8 @@ namespace onmt

void SpaceTokenizer::tokenize(const std::string& text,
std::vector<std::string>& words,
std::vector<std::vector<std::string> >& features) const
std::vector<std::vector<std::string> >& features,
bool) const
{
words.reserve(text.length());

5 changes: 3 additions & 2 deletions src/SubwordEncoder.cc
@@ -56,7 +56,8 @@ namespace onmt
{
}

std::vector<Token> SubwordEncoder::encode_and_annotate(const std::vector<Token>& tokens) const
std::vector<Token> SubwordEncoder::encode_and_annotate(const std::vector<Token>& tokens,
bool training) const
{
std::vector<Token> segments;
segments.reserve(tokens.size() * 2);
@@ -68,7 +69,7 @@
continue;
}

std::vector<Token> sub_segments = encode_and_annotate(token);
std::vector<Token> sub_segments = encode_and_annotate(token, training);
segments.insert(segments.end(),
std::make_move_iterator(sub_segments.begin()),
std::make_move_iterator(sub_segments.end()));