Merge pull request #4 from ThilinaRajapakse/master

Update from source

Pradhy729 authored Jun 9, 2020
2 parents d473d87 + 19ecd79 commit f19d625
Showing 12 changed files with 181 additions and 46 deletions.
19 changes: 19 additions & 0 deletions .all-contributorsrc
@@ -321,6 +321,25 @@
"code",
"doc"
]
},
{
"login": "guy-mor",
"name": "guy-mor",
"avatar_url": "https://avatars2.githubusercontent.com/u/44950985?v=4",
"profile": "https://github.com/guy-mor",
"contributions": [
"bug",
"code"
]
},
{
"login": "cahya-wirawan",
"name": "Cahya Wirawan",
"avatar_url": "https://avatars1.githubusercontent.com/u/7669893?v=4",
"profile": "https://github.com/cahya-wirawan",
"contributions": [
"code"
]
}
],
"contributorsPerLine": 7,
28 changes: 27 additions & 1 deletion CHANGELOG.md
@@ -4,6 +4,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.34.0] - 2020-06-09

### Added

- Added distributed training support for language model training. [@cahya-wirawan](https://github.com/cahya-wirawan)
- Added multiprocessed decoding support for T5 models.


## [0.33.2] - 2020-06-08

### Fixed

- Fixed a bug in adding prefix spaces. Included Longformer in the list of models where prefix spaces are added. [@guy-mor](https://github.com/guy-mor)

## [0.33.1] - 2020-06-08

### Changed

- Changed the tokenization logic of RoBERTa (and other models using the GPT-2 tokenizer) so that a prefix space is added to input sentences.

## [0.33.0] - 2020-06-08

### Added
@@ -811,7 +831,13 @@ Model checkpoint is now saved for all epochs again.

- This CHANGELOG file to hopefully serve as an evolving example of a standardized open source project CHANGELOG.

[0.33.0]: https://github.com/ThilinaRajapakse/simpletransformers/compare/e96aacd...HEAD
[0.34.0]: https://github.com/ThilinaRajapakse/simpletransformers/compare/4789a1d...HEAD

[0.33.2]: https://github.com/ThilinaRajapakse/simpletransformers/compare/bb83151...4789a1d

[0.33.1]: https://github.com/ThilinaRajapakse/simpletransformers/compare/f40331b...bb83151

[0.33.0]: https://github.com/ThilinaRajapakse/simpletransformers/compare/e96aacd...f40331b

[0.32.3]: https://github.com/ThilinaRajapakse/simpletransformers/compare/f5cee79...e96aacd

6 changes: 5 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Downloads](https://pepy.tech/badge/simpletransformers)](https://pepy.tech/project/simpletransformers)
<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
[![All Contributors](https://img.shields.io/badge/all_contributors-34-orange.svg?style=flat-square)](#contributors-)
[![All Contributors](https://img.shields.io/badge/all_contributors-36-orange.svg?style=flat-square)](#contributors-)
<!-- ALL-CONTRIBUTORS-BADGE:END -->

# Simple Transformers
@@ -3310,6 +3310,10 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
<td align="center"><a href="https://www.linkedin.com/in/changlinz/"><img src="https://avatars0.githubusercontent.com/u/29640620?v=4" width="100px;" alt=""/><br /><sub><b>Changlin_NLP</b></sub></a><br /><a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=alexucb" title="Code">💻</a></td>
<td align="center"><a href="https://github.com/jpotoniec"><img src="https://avatars0.githubusercontent.com/u/11078342?v=4" width="100px;" alt=""/><br /><sub><b>jpotoniec</b></sub></a><br /><a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=jpotoniec" title="Code">💻</a></td>
<td align="center"><a href="https://github.com/fcggamou"><img src="https://avatars0.githubusercontent.com/u/20055856?v=4" width="100px;" alt=""/><br /><sub><b>fcggamou</b></sub></a><br /><a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=fcggamou" title="Code">💻</a> <a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=fcggamou" title="Documentation">📖</a></td>
<td align="center"><a href="https://github.com/guy-mor"><img src="https://avatars2.githubusercontent.com/u/44950985?v=4" width="100px;" alt=""/><br /><sub><b>guy-mor</b></sub></a><br /><a href="https://github.com/ThilinaRajapakse/simpletransformers/issues?q=author%3Aguy-mor" title="Bug reports">🐛</a> <a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=guy-mor" title="Code">💻</a></td>
</tr>
<tr>
<td align="center"><a href="https://github.com/cahya-wirawan"><img src="https://avatars1.githubusercontent.com/u/7669893?v=4" width="100px;" alt=""/><br /><sub><b>Cahya Wirawan</b></sub></a><br /><a href="https://github.com/ThilinaRajapakse/simpletransformers/commits?author=cahya-wirawan" title="Code">💻</a></td>
</tr>
</table>

13 changes: 13 additions & 0 deletions docs/_docs/20-lm-specifics.md
@@ -136,3 +136,16 @@ classification_model = ClassificationModel("electra", "outputs/checkpoint-1-epoc

**Note:** Both the `save_discriminator()` and `save_generator()` methods take an optional `output_dir` argument that specifies where the model should be saved.
{: .notice--info}
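
For example, a minimal sketch (the `model` variable and the output paths here are illustrative, assuming a trained ELECTRA `LanguageModelingModel`):

```python
# Assumes `model` is a trained ELECTRA LanguageModelingModel.
# The directory names below are illustrative.
model.save_discriminator("outputs/discriminator")  # save the discriminator only
model.save_generator("outputs/generator")          # save the generator only
```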


## Distributed Training

Simple Transformers supports distributed language model training.

**Tip:** You can find an example script [here](https://github.com/ThilinaRajapakse/simpletransformers/blob/master/examples/language_generation/train_new_lm.py).
{: .notice--success}

You can launch distributed training as shown below.
```bash
python -m torch.distributed.launch --nproc_per_node=4 train_new_lm.py
```
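
Here `--nproc_per_node` is the number of processes to start, typically one per GPU on the machine. The `torch.distributed.launch` utility passes a `--local_rank` argument to each process, which the training script must accept (as the linked example script does).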
11 changes: 11 additions & 0 deletions examples/language_generation/train_new_lm.py
@@ -1,5 +1,6 @@
from simpletransformers.language_modeling import LanguageModelingModel
import logging
import argparse


logging.basicConfig(level=logging.INFO)
@@ -30,8 +31,18 @@
"vocab_size": 10000,
"output_dir": f"outputs/from_scratch_",
"best_model_dir": f"outputs/from_scratch/best_model",
"fp16": False,
"local_rank": -1,
}

parser = argparse.ArgumentParser()
parser.add_argument(
"--local_rank", type=int, default=-1, help="Local rank. Necessary for using the torch.distributed.launch utility."
)
args = parser.parse_args()

train_args["local_rank"] = args.local_rank

train_file = f"data/train.txt"
test_file = f"data/test.txt"

2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
    name="simpletransformers",
    version="0.33.0",
    version="0.34.0",
    author="Thilina Rajapakse",
    author_email="chaturangarajapakshe@gmail.com",
    description="An easy-to-use wrapper library for the Transformers library.",
3 changes: 2 additions & 1 deletion simpletransformers/classification/classification_model.py
@@ -836,7 +836,7 @@ def load_and_cache_examples(
    sep_token=tokenizer.sep_token,
    # RoBERTa uses an extra separator b/w pairs of sentences,
    # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
    sep_token_extra=bool(args["model_type"] in ["roberta", "camembert", "xlmroberta"]),
    sep_token_extra=bool(args["model_type"] in ["roberta", "camembert", "xlmroberta", "longformer"]),
    # PAD on the left for XLNet
    pad_on_left=bool(args["model_type"] in ["xlnet"]),
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
@@ -848,6 +848,7 @@
    sliding_window=args["sliding_window"],
    flatten=not evaluate,
    stride=args["stride"],
    add_prefix_space=bool(args["model_type"] in ["roberta", "camembert", "xlmroberta", "longformer"]),
    args=args,
)
if verbose and args["sliding_window"]:
22 changes: 19 additions & 3 deletions simpletransformers/classification/classification_utils.py
@@ -102,13 +102,21 @@ def convert_example_to_feature(
    sep_token_extra,
    multi_label,
    stride,
    pad_token,
    add_prefix_space,
) = example_row

tokens_a = tokenizer.tokenize(example.text_a)
if add_prefix_space and not example.text_a.startswith(" "):
    tokens_a = tokenizer.tokenize(" " + example.text_a)
else:
    tokens_a = tokenizer.tokenize(example.text_a)

tokens_b = None
if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)
    if add_prefix_space and not example.text_b.startswith(" "):
        tokens_b = tokenizer.tokenize(" " + example.text_b)
    else:
        tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
@@ -215,6 +223,8 @@ def convert_example_to_feature_sliding_window(
    sep_token_extra,
    multi_label,
    stride,
    pad_token,
    add_prefix_space,
) = example_row

if stride < 1:
@@ -223,7 +233,10 @@
bucket_size = max_seq_length - (3 if sep_token_extra else 2)
token_sets = []

tokens_a = tokenizer.tokenize(example.text_a)
if add_prefix_space and not example.text_a.startswith(" "):
    tokens_a = tokenizer.tokenize(" " + example.text_a)
else:
    tokens_a = tokenizer.tokenize(example.text_a)

if len(tokens_a) > bucket_size:
    token_sets = [tokens_a[i : i + bucket_size] for i in range(0, len(tokens_a), stride)]
@@ -322,6 +335,7 @@ def convert_examples_to_features(
    sliding_window=False,
    flatten=False,
    stride=None,
    add_prefix_space=False,
    args=None,
):
    """ Loads a data file into a list of `InputBatch`s
@@ -346,6 +360,8 @@
        sep_token_extra,
        multi_label,
        stride,
        pad_token,
        add_prefix_space,
    )
    for example in examples
]
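
The `add_prefix_space` plumbing above exists because byte-level BPE tokenizers (GPT-2, RoBERTa, Longformer) fold the leading space into the token itself, so the first word of a sentence would otherwise be tokenized differently from the same word appearing mid-sentence. A minimal sketch of the effect, assuming the Hugging Face `transformers` package is available:

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Without a leading space, the first word lacks the 'Ġ' (space) marker.
print(tokenizer.tokenize("Hello world"))   # ['Hello', 'Ġworld']

# With a prefix space, it is tokenized like any mid-sentence word.
print(tokenizer.tokenize(" Hello world"))  # ['ĠHello', 'Ġworld']
```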
1 change: 1 addition & 0 deletions simpletransformers/config/global_args.py
@@ -23,6 +23,7 @@
"fp16_opt_level": "O1",
"gradient_accumulation_steps": 1,
"learning_rate": 4e-5,
"local_rank": -1,
"logging_steps": 50,
"manual_seed": None,
"max_grad_norm": 1.0,
(Diffs for the remaining changed files were not loaded.)
