Commit

Check out to native AdamW
yzhangcs committed Jun 27, 2023
1 parent 854c7ba · commit 21afbee
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions supar/parser.py
@@ -16,7 +16,7 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.cuda.amp import GradScaler
-from torch.optim import Adam, Optimizer
+from torch.optim import Adam, AdamW, Optimizer
 from torch.optim.lr_scheduler import ExponentialLR, _LRScheduler
 
 import supar
@@ -501,8 +501,6 @@ def init_optimizer(self) -> Optimizer:
                              eps=self.args.get('eps', 1e-8),
                              weight_decay=self.args.get('weight_decay', 0))
         else:
-            # we found that Huggingface's AdamW is more robust and empirically better than the native implementation
-            from transformers import AdamW
             optimizer = AdamW(params=[{'params': p, 'lr': self.args.lr * (1 if n.startswith('encoder') else self.args.lr_rate)}
                                       for n, p in self.model.named_parameters()],
                               lr=self.args.lr,
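In effect, the change swaps HuggingFace's transformers.AdamW for PyTorch's native torch.optim.AdamW while keeping the same per-parameter-group learning-rate scheme: one addition for the new import, three deletions for the old comment, the transformers import, and the old torch.optim import line. Below is a minimal, self-contained sketch of that parameter-group pattern with the native AdamW; the toy model and hyperparameter values are placeholders, not taken from supar.

# Minimal sketch (placeholders, not supar code): native torch.optim.AdamW
# with a distinct learning rate for encoder vs. non-encoder parameters,
# mirroring the parameter-group pattern kept by the diff above.
import torch.nn as nn
from torch.optim import AdamW

model = nn.ModuleDict({
    'encoder': nn.Linear(16, 16),  # parameter names start with 'encoder.'
    'decoder': nn.Linear(16, 4),   # parameter names start with 'decoder.'
})

lr, lr_rate = 5e-5, 20  # placeholder values; supar reads these from self.args

optimizer = AdamW(
    params=[{'params': p,
             'lr': lr * (1 if n.startswith('encoder') else lr_rate)}
            for n, p in model.named_parameters()],
    lr=lr,                # default lr for any group that omits its own 'lr'
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0)

Since both AdamW implementations accept these keyword arguments, the switch only needs to touch the import, which is why the diff amounts to one changed import line plus two deleted lines in init_optimizer.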

