
Commit 6422d25

Fixed missing import in train_tacotron.py, now saving optimizer state
1 parent bcb67c2 commit 6422d25

5 files changed: +43 additions, -13 deletions

models/fatchord_version.py

Lines changed: 9 additions & 2 deletions
@@ -388,9 +388,12 @@ def xfade_and_unfold(self, y, target, overlap):
     def get_step(self):
         return self.step.data.item()
 
-    def checkpoint(self, path):
+    def checkpoint(self, path, optimizer):
+        # The optimizer can be given as an argument because the checkpoint
+        # function is only useful in the context of an existing training process.
         k_steps = self.get_step() // 1000
         self.save(f'{path}/checkpoint_{k_steps}k_steps.pyt')
+        torch.save(optimizer.state_dict(), f'{path}/checkpoint_{k_steps}k_steps_optim.pyt')
 
     def log(self, path, msg):
         with open(path, 'a') as f:
@@ -405,10 +408,14 @@ def restore(self, path):
         self.load(path)
 
     def load(self, path, device='cpu'):
-        # because PyTorch places on CPU by default, we follow those semantics by using CPU as default.
+        # Because PyTorch places tensors on the CPU by default, we follow
+        # those semantics and use CPU as the default device.
         self.load_state_dict(torch.load(path, map_location=device), strict=False)
 
     def save(self, path):
+        # No optimizer argument because saving a model should not include data
+        # only relevant to the training process - it should contain only
+        # properties of the model itself. Let the caller save the optimizer state.
         torch.save(self.state_dict(), path)
 
     def num_params(self, print_out=True):
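The division of labour above is deliberate: save() writes only the model's own state_dict, while checkpoint() additionally persists the optimizer state to a separate *_optim.pyt file. Stripped of the WaveRNN specifics, this is the standard PyTorch pattern sketched below (stand-in model, not code from this commit):

import torch
from torch import nn, optim

# Minimal sketch of the save/restore split introduced above: model weights and
# optimizer state are written to separate files. The nn.Linear is a stand-in
# for the real Tacotron / WaveRNN models.
model = nn.Linear(4, 4)
optimizer = optim.Adam(model.parameters())

torch.save(model.state_dict(), 'latest_weights.pyt')    # what model.save() does
torch.save(optimizer.state_dict(), 'latest_optim.pyt')  # what the caller / checkpoint() does

# Resuming restores both pieces, so the optimizer's internal statistics survive.
model.load_state_dict(torch.load('latest_weights.pyt', map_location='cpu'))
optimizer.load_state_dict(torch.load('latest_optim.pyt'))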

models/tacotron.py

Lines changed: 7 additions & 1 deletion
@@ -432,9 +432,12 @@ def reset_step(self):
         # assignment to parameters or buffers is overloaded, updates internal dict entry
         self.step = torch.zeros(1, dtype=torch.long)
 
-    def checkpoint(self, path):
+    def checkpoint(self, path, optimizer):
+        # The optimizer can be given as an argument because the checkpoint
+        # function is only useful in the context of an existing training process.
         k_steps = self.get_step() // 1000
         self.save(f'{path}/checkpoint_{k_steps}k_steps.pyt')
+        torch.save(optimizer.state_dict(), f'{path}/checkpoint_{k_steps}k_steps_optim.pyt')
 
     def log(self, path, msg):
         with open(path, 'a') as f:
@@ -454,6 +457,9 @@ def load(self, path, device='cpu'):
         self.load_state_dict(torch.load(path, map_location=device), strict=False)
 
     def save(self, path):
+        # No optimizer argument because saving a model should not include data
+        # only relevant to the training process - it should contain only
+        # properties of the model itself. Let the caller save the optimizer state.
         torch.save(self.state_dict(), path)
 
     def num_params(self, print_out=True):

train_tacotron.py

Lines changed: 11 additions & 3 deletions
@@ -8,6 +8,8 @@
 from utils.paths import Paths
 from models.tacotron import Tacotron
 import argparse
+from utils import data_parallel_workaround
+import os
 
 
 def np_now(x): return x.detach().cpu().numpy()
@@ -61,7 +63,7 @@ def tts_train_loop(model, optimizer, train_set, lr, train_steps, attn_example):
             avg_loss = running_loss / i
 
             if step % hp.tts_checkpoint_every == 0:
-                model.checkpoint(paths.tts_checkpoints)
+                model.checkpoint(paths.tts_checkpoints, optimizer)
 
             if attn_example in ids:
                 idx = ids.index(attn_example)
@@ -71,6 +73,9 @@ def tts_train_loop(model, optimizer, train_set, lr, train_steps, attn_example):
             msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
             stream(msg)
 
+        # Must save latest optimizer state to ensure that resuming training
+        # doesn't produce artifacts
+        torch.save(optimizer.state_dict(), paths.tts_latest_optim)
         model.save(paths.tts_latest_weights)
         model.log(paths.tts_log, msg)
         print(' ')
@@ -146,7 +151,10 @@ def create_gta_features(model, train_set, save_path):
 
     # model.set_r(hp.tts_r)
 
-    optimiser = optim.Adam(model.parameters())
+    optimizer = optim.Adam(model.parameters())
+    if os.path.isfile(paths.tts_latest_optim):
+        print(f'Loading Optimizer State: "{paths.tts_latest_optim}"')
+        optimizer.load_state_dict(torch.load(paths.tts_latest_optim))
 
     current_step = model.get_step()
 
@@ -169,7 +177,7 @@ def create_gta_features(model, train_set, save_path):
                   ('Learning Rate', lr),
                   ('Outputs/Step (r)', model.get_r())])
 
-    tts_train_loop(model, optimiser, train_set, lr, training_steps, attn_example)
+    tts_train_loop(model, optimizer, train_set, lr, training_steps, attn_example)
 
     print('Training Complete.')
     print('To continue training increase tts_total_steps in hparams.py or use --force_train\n')
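The "artifacts" mentioned in the new comment come from Adam's per-parameter statistics: if training resumes with only the weights reloaded, the moment estimates restart from zero and the first updates behave as if the optimizer were brand new. The short, illustrative snippet below (not part of this commit) shows what optimizer.state_dict() actually captures:

import torch
from torch import nn, optim

# Illustrative only: inspect what Adam's state_dict() holds, i.e. what used to
# be lost when only the model weights were written to disk.
model = nn.Linear(2, 2)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 2)).pow(2).mean()
loss.backward()
optimizer.step()

state = optimizer.state_dict()
print(state.keys())              # dict_keys(['state', 'param_groups'])
print(state['state'][0].keys())  # per-parameter 'step', 'exp_avg', 'exp_avg_sq'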

train_wavernn.py

Lines changed: 14 additions & 7 deletions
@@ -12,11 +12,12 @@
 from utils.paths import Paths
 import argparse
 from utils import data_parallel_workaround
+import os
 
 
-def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr, total_steps, device):
+def voc_train_loop(model, loss_func, optimizer, train_set, test_set, lr, total_steps, device):
 
-    for p in optimiser.param_groups: p['lr'] = lr
+    for p in optimizer.param_groups: p['lr'] = lr
 
     total_iters = len(train_set)
     epochs = (total_steps - model.get_step()) // total_iters + 1
@@ -46,13 +47,13 @@ def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr, total_steps, device):
 
             loss = loss_func(y_hat, y)
 
-            optimiser.zero_grad()
+            optimizer.zero_grad()
             loss.backward()
             if hp.voc_clip_grad_norm is not None:
                 grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.voc_clip_grad_norm)
                 if np.isnan(grad_norm):
                     print('grad_norm was NaN!')
-            optimiser.step()
+            optimizer.step()
             running_loss += loss.item()
 
             speed = i / (time.time() - start)
@@ -64,11 +65,14 @@ def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr, total_steps, device):
             if step % hp.voc_checkpoint_every == 0:
                 gen_testset(model, test_set, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                             hp.voc_target, hp.voc_overlap, paths.voc_output)
-                model.checkpoint(paths.voc_checkpoints)
+                model.checkpoint(paths.voc_checkpoints, optimizer)
 
             msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
             stream(msg)
 
+        # Must save latest optimizer state to ensure that resuming training
+        # doesn't produce artifacts
+        torch.save(optimizer.state_dict(), paths.voc_latest_optim)
         model.save(paths.voc_latest_weights)
         model.log(paths.voc_log, msg)
         print(' ')
@@ -123,7 +127,10 @@ def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr, total_steps, device):
 
     voc_model.restore(paths.voc_latest_weights)
 
-    optimiser = optim.Adam(voc_model.parameters())
+    optimizer = optim.Adam(voc_model.parameters())
+    if os.path.isfile(paths.voc_latest_optim):
+        print(f'Loading Optimizer State: "{paths.voc_latest_optim}"')
+        optimizer.load_state_dict(torch.load(paths.voc_latest_optim))
 
     train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)
 
@@ -137,7 +144,7 @@ def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr, total_steps, device):
 
     loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss
 
-    voc_train_loop(voc_model, loss_func, optimiser, train_set, test_set, lr, total_steps, device)
+    voc_train_loop(voc_model, loss_func, optimizer, train_set, test_set, lr, total_steps, device)
 
     print('Training Complete.')
     print('To continue training increase voc_total_steps in hparams.py or use --force_train')
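Both training scripts now guard the optimizer restore behind an os.path.isfile check so that a fresh run still starts cleanly. If the duplication ever becomes annoying, the shared pattern could be factored into a small helper along these lines (hypothetical helper, not part of this commit):

import os
import torch

def maybe_restore_optimizer(optimizer, optim_path):
    # Hypothetical helper capturing the guard both scripts use: restore the
    # optimizer state only if a previous run left a state file behind.
    if os.path.isfile(optim_path):
        print(f'Loading Optimizer State: "{optim_path}"')
        optimizer.load_state_dict(torch.load(optim_path))
    return optimizer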

utils/paths.py

Lines changed: 2 additions & 0 deletions
@@ -11,12 +11,14 @@ def __init__(self, data_path, voc_id, tts_id):
         # WaveRNN/Vocoder Paths
         self.voc_checkpoints = f'checkpoints/{voc_id}.wavernn/'
         self.voc_latest_weights = f'{self.voc_checkpoints}latest_weights.pyt'
+        self.voc_latest_optim = f'{self.voc_checkpoints}latest_optim.pyt'
         self.voc_output = f'model_outputs/{voc_id}.wavernn/'
         self.voc_step = f'{self.voc_checkpoints}/step.npy'
         self.voc_log = f'{self.voc_checkpoints}log.txt'
         # Tacotron/TTS Paths
         self.tts_checkpoints = f'checkpoints/{tts_id}.tacotron/'
         self.tts_latest_weights = f'{self.tts_checkpoints}latest_weights.pyt'
+        self.tts_latest_optim = f'{self.tts_checkpoints}latest_optim.pyt'
         self.tts_output = f'model_outputs/{tts_id}.tts/'
         self.tts_step = f'{self.tts_checkpoints}/step.npy'
         self.tts_log = f'{self.tts_checkpoints}log.txt'
