Small fixes (#1392)
Summary:
- Set default value of clip-norm back to 0.0 (disabled)
- Add a comment explaining that we divide the loss by log(2) to convert it from base e to base 2
- Fix `--zero-optimizer=os` (fixes #2811)
- Update requirements to PyTorch >= 1.5
- Fix bug in fixed LR schedule

Pull Request resolved: fairinternal/fairseq-py#1392

Reviewed By: alexeib

Differential Revision: D24714231

Pulled By: myleott

fbshipit-source-id: 63dc8cfc74683bbccbf05b44228014eb12ddbfc7
Myle Ott authored and facebook-github-bot committed Nov 4, 2020
1 parent b120fbb commit dd52ed0
Showing 9 changed files with 24 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -112,7 +112,7 @@ and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more example

# Requirements and Installation

* [PyTorch](http://pytorch.org/) version >= 1.4.0
* [PyTorch](http://pytorch.org/) version >= 1.5.0
* Python version >= 3.6
* For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl)
* **To install fairseq** and develop locally:
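For reference, the bumped requirement can be checked at runtime; this guard (and its use of the `packaging` library) is illustrative and not part of this commit:

import torch
from packaging import version

# Hypothetical guard mirroring the new requirement: PyTorch >= 1.5.0.
# torch.__version__ can carry a local suffix like "1.5.0+cu101", so strip it first.
if version.parse(torch.__version__.split("+")[0]) < version.parse("1.5.0"):
    raise RuntimeError(f"PyTorch >= 1.5.0 required, found {torch.__version__}")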
2 changes: 1 addition & 1 deletion config/config.yaml
@@ -63,7 +63,7 @@ dataset:
optimization:
max_epoch: 0
max_update: 0
clip_norm: 25.0
clip_norm: 0.0
sentence_avg: false
update_freq: [ 1 ]
lr: [ 0.25 ]
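A clip_norm of 0.0 follows fairseq's convention of disabling gradient clipping. A minimal sketch of that convention, with an illustrative helper name around the standard torch.nn.utils.clip_grad_norm_:

import torch

def maybe_clip_grad_norm(params, clip_norm: float):
    # clip_norm == 0.0 means "no clipping"; otherwise clip to the given threshold.
    params = [p for p in params if p.grad is not None]
    if clip_norm > 0:
        return torch.nn.utils.clip_grad_norm_(params, max_norm=clip_norm)
    # Still report the total gradient norm so logging stays consistent.
    return torch.norm(torch.stack([torch.norm(p.grad) for p in params]))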
1 change: 1 addition & 0 deletions fairseq/criterions/cross_entropy.py
@@ -64,6 +64,7 @@ def reduce_metrics(logging_outputs) -> None:
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

# we divide by log(2) to convert the loss from base e to base 2
metrics.log_scalar(
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3
)
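The new comment documents the unit change: the cross-entropy is computed with natural logarithms (nats), and dividing by log(2) reports it in bits, so downstream perplexity can be computed as 2 ** loss. A standalone illustration of the conversion:

import math

loss_nats = 2.0                      # per-token cross-entropy in base e (nats)
loss_bits = loss_nats / math.log(2)  # the same quantity in base 2 (bits)
assert abs(2 ** loss_bits - math.exp(loss_nats)) < 1e-9  # perplexity is unchanged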
2 changes: 1 addition & 1 deletion fairseq/dataclass/configs.py
@@ -434,7 +434,7 @@ class OptimizationConfig(FairseqDataclass):
},
)
clip_norm: float = field(
default=25.0, metadata={"help": "clip threshold of gradients"}
default=0.0, metadata={"help": "clip threshold of gradients"}
)
sentence_avg: bool = field(
default=False,
3 changes: 2 additions & 1 deletion fairseq/optim/fp16_optimizer.py
@@ -215,7 +215,8 @@ def zero_grad(self):
raise RuntimeError("self.fp32_params must be a tensor or dict")
else:
for p32 in self.fp32_params:
p32.grad.zero_()
if p32.grad:
p32.grad.zero_()
self._needs_sync = False

if self.scaler is not None:
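The added check skips fp32 shadow parameters that have no gradient yet (for example, before the first backward pass). A minimal sketch of the same defensive idiom, independent of fairseq's optimizer classes:

def zero_grads(params):
    # Some parameters may not have received a gradient yet; skip them instead of
    # dereferencing a missing .grad.
    for p in params:
        if p.grad is not None:
            p.grad.zero_()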
4 changes: 4 additions & 0 deletions fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py
@@ -34,6 +34,10 @@ def load_state_dict(self, state_dict):
"""Load an LR scheduler state dict."""
self.best = state_dict["best"]

def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
pass

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
if val_loss is not None:
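The new step_begin_epoch hook is a no-op in the base class; schedulers that want an epoch-start update override it while keeping the end-of-epoch step. A sketch of a subclass using both hooks (the class and its decay rule are illustrative, not from this commit):

from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import FairseqLRScheduler

class HalveAfterTenEpochs(FairseqLRScheduler):
    """Illustrative scheduler: halve the LR at the start of every epoch after the tenth."""

    def step_begin_epoch(self, epoch):
        # Called by the trainer before training on `epoch` (epochs are 1-indexed).
        if epoch > 10:
            self.optimizer.set_lr(self.optimizer.get_lr() * 0.5)
        return self.optimizer.get_lr()

    def step(self, epoch, val_loss=None):
        # End-of-epoch hook; the base class records the best val_loss.
        super().step(epoch, val_loss)
        return self.optimizer.get_lr()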
9 changes: 4 additions & 5 deletions fairseq/optim/lr_scheduler/fixed_schedule.py
@@ -27,7 +27,7 @@ def add_args(parser):
"""Add arguments to the parser for this LR scheduler."""
# fmt: off
parser.add_argument('--force-anneal', '--fa', type=int, metavar='N',
help='force annealing at specified epoch')
help='force annealing at specified epoch (epochs start at 1)')
parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
help='shrink factor for annealing, lr_new = (lr * lr_shrink)')
parser.add_argument('--warmup-updates', default=0, type=int, metavar='N',
@@ -45,17 +45,16 @@ def get_next_lr(self, epoch):
lrs = self.args.lr
if self.args.force_anneal is None or epoch < self.args.force_anneal:
# use fixed LR schedule
next_lr = lrs[min(epoch, len(lrs) - 1)]
next_lr = lrs[min(epoch - 1, len(lrs) - 1)]
else:
# anneal based on lr_shrink
next_lr = lrs[-1] * self.args.lr_shrink ** (
epoch + 1 - self.args.force_anneal
)
return next_lr

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
super().step(epoch, val_loss)
def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
self.lr = self.get_next_lr(epoch)
self.optimizer.set_lr(self.warmup_factor * self.lr)
return self.optimizer.get_lr()
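Because epochs are 1-indexed, indexing the --lr list with epoch - 1 makes epoch 1 use the first configured LR instead of skipping it. A standalone illustration of the corrected lookup:

lrs = [0.1, 0.01, 0.001]  # e.g. --lr 0.1 0.01 0.001

for epoch in range(1, 6):  # epochs start at 1
    lr = lrs[min(epoch - 1, len(lrs) - 1)]
    # epoch 1 -> 0.1, epoch 2 -> 0.01, epochs 3+ -> 0.001
    print(epoch, lr)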
5 changes: 2 additions & 3 deletions fairseq/optim/lr_scheduler/polynomial_decay_schedule.py
@@ -57,9 +57,8 @@ def get_next_lr(self, epoch):
next_lr = self.optimizer.get_lr()
return next_lr

def step(self, epoch, val_loss=None):
"""Update the learning rate at the end of the given epoch."""
super().step(epoch, val_loss)
def step_begin_epoch(self, epoch):
"""Update the learning rate at the beginning of the given epoch."""
self.lr = self.get_next_lr(epoch)
self.optimizer.set_lr(self.warmup_factor * self.lr)
return self.optimizer.get_lr()
8 changes: 8 additions & 0 deletions fairseq/trainer.py
@@ -429,6 +429,8 @@ def begin_epoch(self, epoch):
"""Called at the beginning of each epoch."""
logger.info("begin training epoch {}".format(epoch))

self.lr_step_begin_epoch(epoch)

if self.quantizer is not None:
self.quantizer.begin_epoch(epoch)

@@ -782,6 +784,12 @@ def valid_step(self, sample, raise_oom=False):
def zero_grad(self):
self.optimizer.zero_grad()

def lr_step_begin_epoch(self, epoch):
"""Adjust the learning rate at the beginning of the epoch."""
self.lr_scheduler.step_begin_epoch(epoch)
# prefer updating the LR based on the number of steps
return self.lr_step_update()

def lr_step(self, epoch, val_loss=None):
"""Adjust the learning rate at the end of the epoch."""
self.lr_scheduler.step(epoch, val_loss)
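Taken together, begin_epoch now fires the epoch-start LR hook and then defers to update-based schedules, while lr_step still runs at the end of the epoch. A sketch of the resulting call order (the loop and its argument names are illustrative, not fairseq's actual training loop):

def run_epochs(trainer, batches_by_epoch, max_epoch):
    # Illustrative only: shows where the LR hooks fire around an epoch.
    for epoch in range(1, max_epoch + 1):
        trainer.begin_epoch(epoch)        # internally calls lr_step_begin_epoch(epoch)
        for samples in batches_by_epoch[epoch]:
            trainer.train_step(samples)   # per-update LR schedules take precedence
        trainer.lr_step(epoch)            # end-of-epoch adjustment, unchanged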
