Small fixes (#1392)
Summary:
- Set default value of clip-norm back to 0.0 (disabled)
- Add a comment explaining that we divide the loss by log(2) to convert it from base e to base 2
- Fix `--zero-optimizer=os` (fixes #2811)
- Update requirements to PyTorch >= 1.5
- Fix bug in fixed LR schedule

Pull Request resolved: fairinternal/fairseq-py#1392

Reviewed By: alexeib

Differential Revision: D24714231

Pulled By: myleott

fbshipit-source-id: 63dc8cfc74683bbccbf05b44228014eb12ddbfc7
Myle Ott authored and facebook-github-bot committed Nov 4, 2020
1 parent b120fbb commit dd52ed0
Showing 9 changed files with 24 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -112,7 +112,7 @@ and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more example

# Requirements and Installation

* [PyTorch](http://pytorch.org/) version >= 1.4.0
* [PyTorch](http://pytorch.org/) version >= 1.5.0
* Python version >= 3.6
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* **To install fairseq** and develop locally:
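For reference, the bumped requirement can be checked at runtime; this guard (and its use of the `packaging` library) is illustrative and not part of this commit:

import torch
from packaging import version

# Hypothetical guard mirroring the new requirement: PyTorch >= 1.5.0.
# torch.__version__ can carry a local suffix like "1.5.0+cu101", so strip it first.
if version.parse(torch.__version__.split("+")[0]) < version.parse("1.5.0"):
    raise RuntimeError(f"PyTorch >= 1.5.0 required, found {torch.__version__}")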
2 changes: 1 addition & 1 deletion config/config.yaml
@@ -63,7 +63,7 @@ dataset:
optimization:
max_epoch: 0
max_update: 0
clip_norm: 25.0
clip_norm: 0.0
sentence_avg: false
update_freq: [ 1 ]
lr: [ 0.25 ]
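A clip_norm of 0.0 follows fairseq's convention of disabling gradient clipping. A minimal sketch of that convention, with an illustrative helper name around the standard torch.nn.utils.clip_grad_norm_:

import torch

def maybe_clip_grad_norm(params, clip_norm: float):
    # clip_norm == 0.0 means "no clipping"; otherwise clip to the given threshold.
    params = [p for p in params if p.grad is not None]
    if clip_norm > 0:
        return torch.nn.utils.clip_grad_norm_(params, max_norm=clip_norm)
    # Still report the total gradient norm so logging stays consistent.
    return torch.norm(torch.stack([torch.norm(p.grad) for p in params]))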
1 change: 1 addition & 0 deletions fairseq/criterions/cross_entropy.py
@@ -64,6 +64,7 @@ def reduce_metrics(logging_outputs) -> None:
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

# we divide by log(2) to convert the loss from base e to base 2
metrics.log_scalar(
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3
)
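The new comment documents the unit change: the cross-entropy is computed with natural logarithms (nats), and dividing by log(2) reports it in bits, so downstream perplexity can be computed as 2 ** loss. A standalone illustration of the conversion:

import math

loss_nats = 2.0                      # per-token cross-entropy in base e (nats)
loss_bits = loss_nats / math.log(2)  # the same quantity in base 2 (bits)
assert abs(2 ** loss_bits - math.exp(loss_nats)) < 1e-9  # perplexity is unchanged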
2 changes: 1 addition & 1 deletion fairseq/dataclass/configs.py
@@ -434,7 +434,7 @@ class OptimizationConfig(FairseqDataclass):
},
)
clip_norm: float = field(
default=25.0, metadata={"help": "clip threshold of gradients"}
default=0.0, metadata={"help": "clip threshold of gradients"}
)
sentence_avg: bool = field(
default=False,
3 changes: 2 additions & 1 deletion fairseq/optim/fp16_optimizer.py
@@ -215,7 +215,8 @@ def zero_grad(self):
raise RuntimeError("self.fp32_params must be a tensor or dict")
else:
for p32 in self.fp32_params:
p32.grad.zero_()
if p32.grad:
p32.grad.zero_()
self._needs_sync = False

if self.scaler is not None:
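The added check skips fp32 shadow parameters that have no gradient yet (for example, before the first backward pass). A minimal sketch of the same defensive idiom, independent of fairseq's optimizer classes:

def zero_grads(params):
    # Some parameters may not have received a gradient yet; skip them instead of
    # dereferencing a missing .grad.
    for p in params:
        if p.grad is not None:
            p.grad.zero_()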
4 changes: 4 additions & 0 deletions fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -34,6 +34,10 @@ def load_state_dict(self, state_dict):
"""Load an LR scheduler state dict."""
self.best = state_dict["best"]

def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
pass

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
if val_loss is not None:
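The new step_begin_epoch hook is a no-op in the base class; schedulers that want an epoch-start update override it while keeping the end-of-epoch step. A sketch of a subclass using both hooks (the class and its decay rule are illustrative, not from this commit):

from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import FairseqLRScheduler

class HalveAfterTenEpochs(FairseqLRScheduler):
    """Illustrative scheduler: halve the LR at the start of every epoch after the tenth."""

    def step_begin_epoch(self, epoch):
        # Called by the trainer before training on `epoch` (epochs are 1-indexed).
        if epoch > 10:
            self.optimizer.set_lr(self.optimizer.get_lr() * 0.5)
        return self.optimizer.get_lr()

    def step(self, epoch, val_loss=None):
        # End-of-epoch hook; the base class records the best val_loss.
        super().step(epoch, val_loss)
        return self.optimizer.get_lr()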
9 changes: 4 additions & 5 deletions fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -27,7 +27,7 @@ def add_args(parser):
"""Add arguments to the parser for this LR scheduler."""
# fmt: off
parser.add_argument('--force-anneal', '--fa', type=int, metavar='N',
help='force annealing at specified epoch')
help='force annealing at specified epoch (epochs start at 1)')
parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
help='shrink factor for annealing, lr_new = (lr * lr_shrink)')
parser.add_argument('--warmup-updates', default=0, type=int, metavar='N',
@@ -45,17 +45,16 @@ def get_next_lr(self, epoch):
lrs = self.args.lr
if self.args.force_anneal is None or epoch < self.args.force_anneal:
# use fixed LR schedule
next_lr = lrs[min(epoch, len(lrs) - 1)]
next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
else:
# anneal based on lr_shrink
next_lr = lrs[-1] * self.args.lr_shrink ** (
epoch + 1 - self.args.force_anneal
)
return next_lr

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
super().step(epoch, val_loss)
def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
self.lr = self.get_next_lr(epoch)
self.optimizer.set_lr(self.warmup_factor * self.lr)
return self.optimizer.get_lr()
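Because epochs are 1-indexed, indexing the --lr list with epoch - 1 makes epoch 1 use the first configured LR instead of skipping it. A standalone illustration of the corrected lookup:

lrs = [0.1, 0.01, 0.001]  # e.g. --lr 0.1 0.01 0.001

for epoch in range(1, 6):  # epochs start at 1
    lr = lrs[min(epoch - 1, len(lrs) - 1)]
    # epoch 1 -> 0.1, epoch 2 -> 0.01, epochs 3+ -> 0.001
    print(epoch, lr)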
5 changes: 2 additions & 3 deletions fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -57,9 +57,8 @@ def get_next_lr(self, epoch):
next_lr = self.optimizer.get_lr()
return next_lr

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
super().step(epoch, val_loss)
def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
self.lr = self.get_next_lr(epoch)
self.optimizer.set_lr(self.warmup_factor * self.lr)
return self.optimizer.get_lr()
8 changes: 8 additions & 0 deletions fairseq/trainer.py
@@ -429,6 +429,8 @@ def begin_epoch(self, epoch):
"""Called at the beginning of each epoch."""
logger.info("begin training epoch {}".format(epoch))

self.lr_step_begin_epoch(epoch)

if self.quantizer is not None:
self.quantizer.begin_epoch(epoch)

@@ -782,6 +784,12 @@ def valid_step(self, sample, raise_oom=False):
def zero_grad(self):
self.optimizer.zero_grad()

def lr_step_begin_epoch(self, epoch):
"""Adjust the learning rate at the beginning of the epoch."""
self.lr_scheduler.step_begin_epoch(epoch)
# prefer updating the LR based on the number of steps
return self.lr_step_update()

def lr_step(self, epoch, val_loss=None):
"""Adjust the learning rate at the end of the epoch."""
self.lr_scheduler.step(epoch, val_loss)
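Taken together, begin_epoch now fires the epoch-start LR hook and then defers to update-based schedules, while lr_step still runs at the end of the epoch. A sketch of the resulting call order (the loop and its argument names are illustrative, not fairseq's actual training loop):

def run_epochs(trainer, batches_by_epoch, max_epoch):
    # Illustrative only: shows where the LR hooks fire around an epoch.
    for epoch in range(1, max_epoch + 1):
        trainer.begin_epoch(epoch)        # internally calls lr_step_begin_epoch(epoch)
        for samples in batches_by_epoch[epoch]:
            trainer.train_step(samples)   # per-update LR schedules take precedence
        trainer.lr_step(epoch)            # end-of-epoch adjustment, unchanged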
