# [examples] SummarizationModule improvements #4951
Merged
Changes from all commits (255 commits, all authored by sshleifer):
- `71850ad` copy decoder layers
- `b01f1d5` Failing test
- `937d3d6` Merge branch 'master' into distilbart
- `fcc49a0` can import
- `4f5790f` passing
- `4be1287` real data test passes
- `8afb88c` relatif importschlossen
- `d2cc12b` boom boom
- `1edc50f` bash
- `174aaf3` Fast dev run
- `5a35811` boom boom
- `6302fb0` bs=8
- `5a3ed99` Cache tokenized
- `70cf536` Merge branch 'distilbart' of github.com:sshleifer/transformers_fork i…
- `ca9c685` rouge
- `7081d0f` add rouge
- `0ed4156` boom boom
- `04a8ace` boom boom
- `696c8c2` boom boom
- `3cebd56` boom boom
- `bf5782e` boom boom
- `74704ce` boom boom
- `bbc4e52` assert student small
- `c4530f9` batch
- `2ee9388` boom boom
- `51221fb` boom boom
- `abb81df` boom boom
- `bef77fc` boom boom
- `2b7132c` metrics saving, but no val_check_interval honored
- `2c72948` val check test passing with smaller batch size
- `af315c1` boom boom
- `5fbf422` summaries for file
- `09cc1e6` save preds
- `5947812` add files
- `59c658d` fixes
- `beda65d` bash
- `c774160` imports
- `c79351a` fix flatten
- `05bfb1e` Support fewer layers
- `cd461e1` fix
- `67f9553` Run encoder once
- `f281724` wandb
- `1fc653f` log_metrics
- `b62bfe6` spelling
- `a250d1c` boom boom
- `18524d8` boom boom
- `a2671a6` boom boom
- `fee71b4` npars
- `a642076` boom boom
- `cf47a6f` boom boom
- `ce0de90` boom boom
- `b39c39f` boom boom
- `7a7dcd3` l2copy[1] = 0
- `910da0f` boom boom
- `2c2a10e` orig tests pass
- `21a2cc0` delete run_distiller -> main
- `07da062` avoid losing bart grouped batch sampler work
- `dd418e8` boom boom
- `ac72c7a` convenience method
- `88b4b97` switch copy logic
- `3f73cb7` boom boom
- `15a2d4e` boom boom
- `3461e24` boom boom
- `ca82946` boom boom
- `f5606df` boom boom
- `5d75172` Freeze encoder fix
- `aa995f9` freeze after init
- `bcb9996` overcautious freezing
- `29499c9` boom boom
- `705ed2f` Better desc
- `db3d85a` boom boom
- `c6f7e14` progbar
- `fa0eda4` boom boom
- `021d2b9` only train
- `c779c38` boom boom
- `d4830f9` rename
- `a78603c` boom boom
- `1ca011f` test_mtl=140 default
- `8fbdd7d` boom boom
- `ef45ee9` boom boom
- `d328871` add RougeTracker
- `584c329` encoder different
- `20abeec` passing
- `e48a829` boom boom
- `9f21462` boom boom
- `6f03e39` boom boom
- `2443a7e` boom boom
- `4d3617a` boom boom
- `6f1757d` boom boom
- `af8b962` boom boom
- `c7f6e62` boom boom
- `c154fb3` style
- `a42f09e` warmup_steps=500
- `88f9f91` Sortish Sampler
- `800dcdc` boom boom
- `db52f37` save fewer checkpoints
- `f5e697b` Dont require model_name_or_path
- `20a5968` passing with enc mse
- `12f2d9e` early quitting for encoder only loss
- `0933e11` boom boom
- `56aedac` boom boom
- `63521f1` boom boom
- `0c3d22e` boom boom
- `81e939f` boom boom
- `c10bb9f` boom boom
- `d3611c9` Run distiller: no data defaults
- `d2cd7b4` Somehow passing with better freezing logic
- `e7d9dc3` dont default teacher
- `ee18f81` Merge branch 'distilbart' of github.com:sshleifer/transformers_fork i…
- `298ec53` --freeze_encoder
- `f750d80` boom boom
- `9b54356` Logging adjustments
- `4f19747` boom boom
- `d92a69f` boom boom
- `99e7161` boom boom
- `7f1af24` freeze decoder
- `44743e8` fix ckpt bug maybe
- `84ea9fa` Fixed tests
- `0c3d6d8` boom boom
- `516bbca` boom boom
- `175efd8` getstate
- `3c34cae` distrib hacks
- `3f6463a` attempt logger.name fix
- `324d4dc` No wandb if ddp
- `eb4b3cc` boom boom
- `d53236c` boom boom
- `7c52e73` boom boom
- `25edee1` boom boom
- `6689b59` boom boom
- `8fb24ce` boom boom
- `bd361ea` rename generic_train -> build_trainer
- `105d83d` undo rename
- `7b9557f` boom boom
- `60a9014` rouge tracker fix
- `82a6b83` Merge branch 'master' into distilbart
- `e17df74` merged master
- `789f06e` less annoying requirements
- `fb6adb2` backup rouge dfs
- `b11fc69` boom boom
- `8c6769e` boom boom
- `df0b28a` boom boom
- `8f614af` broken
- `a2ba031` boom boom
- `440744a` Distiller brewer does not break
- `20aedfb` boom boom
- `3e47e61` Disable cos loss
- `a65ab52` boom boom
- `cc83d34` boom boom
- `0b98750` boom boom
- `8d4f075` boom boom
- `24f483c` boom boom
- `e3a00a4` boom boom
- `8b98c4c` boom boom
- `8f68b91` boom boom
- `055167b` boom boom
- `e446fd2` boom boom
- `014c061` boom boom
- `e146713` passing
- `9fc692d` boom boom
- `3abbf78` boom boom
- `47c226e` Merge branch 'distilbart' of github.com:sshleifer/transformers_fork i…
- `bb1bffc` Save top1
- `09f199b` Merge branch 'distilbart' of github.com:sshleifer/transformers_fork i…
- `3b304af` boom boom
- `c294e5e` boom boom
- `7ccd236` Evaluate_checkpoint
- `b32d283` boom boom
- `c59be9b` boom boom
- `ecff6f9` V8 and generate_summaries fixes
- `e46e101` boom boom
- `bcb4624` boom boom
- `3467f1c` boom boom
- `582efc0` boom boom
- `9f6596e` boom boom
- `3c38444` boom boom
- `630af0c` boom boom
- `318a585` boom boom
- `31b4c8c` boom boom
- `fa7f818` t5 failing
- `0dc32e6` boom boom
- `a79f5fb` boom boom
- `0098872` Deleted SummarizationDistiller
- `a387c05` boom boom
- `76377c9` boom boom
- `315ab6b` boom boom
- `b5adf48` boom boom
- `d3b8694` boom boom
- `dc785f0` boom boom
- `af05c0e` boom boom
- `913bb8f` boom boom
- `d26f5da` stuck on t5 mask
- `c553e12` t5 works
- `7f3f9fa` boom boom
- `6624b63` boom boom
- `bad1b75` boom boom
- `a549aaa` boom boom
- `574ffbb` boom boom
- `b2b222d` boom boom
- `a2f2d5d` undo some chg
- `6aa28bd` ignore student
- `585bb57` passing cpu
- `5a28144` fp16_ever=False
- `e5728e4` boom boom
- `10b092e` boom boom
- `ac47dbd` boom boom
- `c7d05f7` boom boom
- `4224ac9` boom boom
- `3f3040a` boom boom
- `9d99761` passing
- `bdf0b1a` boom boom
- `fb6eb09` Failing but minimal
- `af3f86f` remove some cruft
- `c5a1de5` boom boom
- `e67ad41` boom boom
- `0274338` original tests pass
- `abee3e2` Merge branch 'master' into distilbart-clean
- `de7035d` style
- `e2d4544` style
- `eab9779` better mask logic
- `350eeb7` style
- `3e62d96` add git-python requirement
- `379e8c7` Cleanup
- `35a82ee` boom boom
- `6bf996f` boom boom
- `5526be3` boom boom
- `eb84b9d` boom boom
- `4606bda` Bash cleanup
- `9f98a6a` docs
- `969c271` more honest docs
- `f179e7b` Allow wandb logger
- `c34b886` Wandb logger
- `c9597fe` docs
- `9e95429` docs
- `99de2c3` pass through logger
- `6bdfb14` boom boom
- `5283878` Move stuff to utils
- `8d867e9` on_save_checkpoint
- `b7bb7cb` Fix decoder mask
- `9508199` Better logger name
- `68f6ccd` Fixed merge conflicts
- `1ffd6cb` style
- `0d592c5` fix import
- `7ff1d78` Merge branch 'master' into distilbart-clean
- `99a4866` Merge branch 'master' into distilbart-clean
- `d25442a` rename -> SummarizationModule
- `b56b4d8` more tips
- `2deff3b` Fix README
- `f28fc63` cleanup
- `0ce2375` Cleanup more
- `b7e1d5e` indent
examples/summarization/README.md:

````diff
@@ -1,47 +1,70 @@
-### Get CNN Data
-To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
+### Data
+
+CNN/DailyMail data
 ```bash
 cd examples/summarization
 wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz
 tar -xzvf cnn_dm.tgz
 export CNN_DIR=${PWD}/cnn_dm
 ```
 
 this should make a directory called cnn_dm/ with files like `test.source`.
 To use your own data, copy that file format. Each article to be summarized is on its own line.
 
 XSUM Data:
 ```bash
 cd examples/summarization
 wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz
 tar -xzvf xsum.tar.gz
 export XSUM_DIR=${PWD}/xsum
 ```
 
 ### Evaluation
 
 To create summaries for each article in the dataset, run:
 ```bash
-python evaluate_cnn.py <path_to_test.source> test_generations.txt <model-name> --score_path rouge_scores.txt
+python run_eval.py <path_to_test.source> test_generations.txt <model-name> --score_path rouge_scores.txt
 ```
-The default batch size, 8, fits in 16GB GPU memory, but may need to be adjusted to fit your system.
+The default batch size, 4, fits in 16GB GPU memory, but may need to be adjusted to fit your system.
 
 ### Training
-Run/modify `finetune_bart.sh` or `finetune_t5.sh`
+Run/modify `finetune.sh`.
 
-### Stanford CoreNLP Setup
-```
-ptb_tokenize () {
-    cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2
-}
-
-sudo apt install openjdk-8-jre-headless
-sudo apt-get install ant
-wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
-unzip stanford-corenlp-full-2018-10-05.zip
-cd stanford-corenlp-full-2018-10-05
-export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar
-```
-Then run `ptb_tokenize` on `test.target` and your generated hypotheses.
-
-### Rouge Setup
-Install `files2rouge` following the instructions at [here](https://github.com/pltrdy/files2rouge).
-I also needed to run `sudo apt-get install libxml-parser-perl`
-
-```python
-from files2rouge import files2rouge
-from files2rouge import settings
-files2rouge.run(<path_to_tokenized_hypo>,
-                <path_to_tokenized_target>,
-                saveto='rouge_output.txt')
-```
+The following command should work on a 16GB GPU:
+```bash
+export me=`git config user.name`
+./finetune.sh \
+    --data_dir $XSUM_DIR \
+    --train_batch_size=1 \
+    --eval_batch_size=1 \
+    --output_dir="$me"_xsum_results \
+    --num_train_epochs 1
+```
+
+Tips:
+- 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB of GPU RAM with fp16 on an NVIDIA V100.
+- Try `bart-base`, `--freeze_encoder`, or `--freeze_embeds` for faster training/larger batch size (3hr/epoch with bs=8, see below).
+- `fp16_opt_level=O1` (the default) works best.
+- If you are finetuning on your own dataset, start from `bart-large-cnn` if you want long summaries and `bart-large-xsum` if you want short summaries.
+(It rarely makes sense to start from `bart-large` unless you are researching finetuning methods.)
+- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
+Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr')`.
+- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
+- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
+
+### XSUM Shared Task
+Compare XSUM results with others by using `--logger wandb_shared`. This requires `wandb` registration.
+Here is an example command:
+```bash
+export me=`git config user.name`
+./finetune.sh \
+    --data_dir $XSUM_DIR \
+    --output_dir "$me"_xsum_frozen_embs \
+    --logger wandb_shared \
+    --train_batch_size 16 --eval_batch_size 16 --freeze_embeds --freeze_encoder \
+    --num_train_epochs 6
+```
+
+Results can be viewed [here](https://app.wandb.ai/sshleifer/hf_summarization/table?workspace=user-).
````
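The sketches below are editorial illustrations, not part of the PR. First, the data format: as the README states, `test.source` and `test.target` are line-aligned, with one article or reference summary per line. A minimal Python sketch of loading such a pair of files (the helper name is hypothetical):

```python
from pathlib import Path


def load_split(data_dir: str, split: str = "test"):
    """Load line-aligned article/summary pairs, one example per line."""
    articles = Path(data_dir, f"{split}.source").read_text().splitlines()
    summaries = Path(data_dir, f"{split}.target").read_text().splitlines()
    assert len(articles) == len(summaries), "source and target must be line-aligned"
    return list(zip(articles, summaries))


# e.g. pairs = load_split("cnn_dm"); print(pairs[0][0][:100])
```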
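`run_eval.py` writes one generated summary per input line. Its exact flags and defaults are not shown in this diff; a rough sketch of the same batched-generation idea using the public transformers API (the checkpoint name and generation settings here are placeholders, not the script's actual defaults):

```python
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)


def summarize(articles, batch_size=4):  # the README's default batch size is 4
    summaries = []
    for i in range(0, len(articles), batch_size):
        batch = tokenizer(articles[i:i + batch_size], return_tensors="pt",
                          truncation=True, padding=True).to(device)
        with torch.no_grad():
            ids = model.generate(batch["input_ids"],
                                 attention_mask=batch["attention_mask"])
        summaries.extend(tokenizer.batch_decode(ids, skip_special_tokens=True))
    return summaries
```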
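On the checkpoint tip above: besides the pytorch-lightning `.ckpt` file, a transformers-format checkpoint is saved under `best_tfmr` and loads with the standard `from_pretrained` API. A sketch, assuming a hypothetical output directory and that the tokenizer files were saved alongside the weights:

```python
from transformers import BartForConditionalGeneration, BartTokenizer

output_dir = "sshleifer_xsum_results"  # hypothetical --output_dir passed to finetune.sh
model = BartForConditionalGeneration.from_pretrained(f"{output_dir}/best_tfmr")
tokenizer = BartTokenizer.from_pretrained(f"{output_dir}/best_tfmr")  # assumes tokenizer files were saved too
```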
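The PR drops the Stanford CoreNLP and `files2rouge` setup from the README; judging by the `rouge`/`add rouge` commits above, ROUGE is computed in Python instead. The PR's own helper is not shown in this diff; a sketch of equivalent scoring with the `rouge_score` package:

```python
from rouge_score import rouge_scorer  # pip install rouge-score

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)


def mean_rouge(predictions, references):
    """Average ROUGE F-measures over line-aligned prediction/reference pairs."""
    totals = {k: 0.0 for k in ("rouge1", "rouge2", "rougeL")}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)  # reference first, prediction second
        for k in totals:
            totals[k] += scores[k].fmeasure
    return {k: v / len(predictions) for k, v in totals.items()}
```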
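Finally, `--freeze_encoder` and `--freeze_embeds`: both speed up training and allow larger batches by excluding parameters from gradient updates. The module's own freezing helpers are not part of this diff; a generic sketch of the idea, assuming the attribute layout of the transformers BART implementation:

```python
from transformers import BartForConditionalGeneration


def freeze_params(module):
    """Exclude a submodule's parameters from gradient updates."""
    for p in module.parameters():
        p.requires_grad = False


model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

# --freeze_encoder: leave only the decoder side trainable.
freeze_params(model.model.encoder)

# --freeze_embeds: freeze the shared token embeddings and both
# positional embedding tables.
freeze_params(model.model.shared)
freeze_params(model.model.encoder.embed_positions)
freeze_params(model.model.decoder.embed_positions)
```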