TensorSpeech
diff --git a/examples/fastspeech/train_fastspeech.py b/examples/fastspeech/train_fastspeech.py
Lines changed: 9 additions & 7 deletions
diff --git a/examples/fastspeech2/train_fastspeech2.py b/examples/fastspeech2/train_fastspeech2.py
Lines changed: 9 additions & 8 deletions
diff --git a/examples/fastspeech2_libritts/train_fastspeech2.py b/examples/fastspeech2_libritts/train_fastspeech2.py
Lines changed: 58 additions & 39 deletions
diff --git a/examples/melgan.stft/train_melgan_stft.py b/examples/melgan.stft/train_melgan_stft.py
Lines changed: 13 additions & 10 deletions
@@ -36,8 +36,7 @@
 from tensorflow_tts.models import TFFastSpeech
 from tensorflow_tts.optimizers import AdamWeightDecay, WarmUp
 from tensorflow_tts.trainers import Seq2SeqBasedTrainer
-from tensorflow_tts.utils import (calculate_2d_loss, calculate_3d_loss,
-                                  return_strategy)
+from tensorflow_tts.utils import calculate_2d_loss, calculate_3d_loss, return_strategy
 
 
 class FastSpeechTrainer(Seq2SeqBasedTrainer):
@@ -218,7 +217,7 @@ def main():
         default="",
         type=str,
         nargs="?",
-        help='pretrained checkpoint file to load weights from. Auto-skips non-matching layers',
+        help="pretrained checkpoint file to load weights from. Auto-skips non-matching layers",
     )
     args = parser.parse_args()
 
@@ -302,7 +301,9 @@ def main():
     ).create(
         is_shuffle=config["is_shuffle"],
         allow_cache=config["allow_cache"],
-        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
+        batch_size=config["batch_size"]
+        * STRATEGY.num_replicas_in_sync
+        * config["gradient_accumulation_steps"],
     )
 
     valid_dataset = CharactorDurationMelDataset(
@@ -335,11 +336,12 @@ def main():
         )
         fastspeech._build()
         fastspeech.summary()
-        
+
         if len(args.pretrained) > 1:
             fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
-            logging.info(f"Successfully loaded pretrained weight from {args.pretrained}.")
-
+            logging.info(
+                f"Successfully loaded pretrained weight from {args.pretrained}."
+            )
 
         # AdamW for fastspeech
         learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
 
@@ -33,15 +33,13 @@
 from tqdm import tqdm
 
 import tensorflow_tts
-from examples.fastspeech2.fastspeech2_dataset import \
-    CharactorDurationF0EnergyMelDataset
+from examples.fastspeech2.fastspeech2_dataset import CharactorDurationF0EnergyMelDataset
 from examples.fastspeech.train_fastspeech import FastSpeechTrainer
 from tensorflow_tts.configs import FastSpeech2Config
 from tensorflow_tts.models import TFFastSpeech2
 from tensorflow_tts.optimizers import AdamWeightDecay, WarmUp
 from tensorflow_tts.trainers import Seq2SeqBasedTrainer
-from tensorflow_tts.utils import (calculate_2d_loss, calculate_3d_loss,
-                                  return_strategy)
+from tensorflow_tts.utils import calculate_2d_loss, calculate_3d_loss, return_strategy
 
 
 class FastSpeech2Trainer(Seq2SeqBasedTrainer):
@@ -244,9 +242,8 @@ def main():
         default="",
         type=str,
         nargs="?",
-        help='pretrained weights .h5 file to load weights from. Auto-skips non-matching layers',
+        help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
     )
-    
 
     args = parser.parse_args()
 
@@ -330,7 +327,9 @@ def main():
     ).create(
         is_shuffle=config["is_shuffle"],
         allow_cache=config["allow_cache"],
-        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
+        batch_size=config["batch_size"]
+        * STRATEGY.num_replicas_in_sync
+        * config["gradient_accumulation_steps"],
     )
 
     valid_dataset = CharactorDurationF0EnergyMelDataset(
@@ -367,7 +366,9 @@ def main():
         fastspeech.summary()
         if len(args.pretrained) > 1:
             fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
-            logging.info(f"Successfully loaded pretrained weight from {args.pretrained}.")
+            logging.info(
+                f"Successfully loaded pretrained weight from {args.pretrained}."
+            )
 
         # AdamW for fastspeech
         learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
 
@@ -33,22 +33,33 @@
 import json
 
 import tensorflow_tts
-from examples.fastspeech2_libritts.fastspeech2_dataset import \
-    CharactorDurationF0EnergyMelDataset
+from examples.fastspeech2_libritts.fastspeech2_dataset import (
+    CharactorDurationF0EnergyMelDataset,
+)
 from tensorflow_tts.configs import FastSpeech2Config
 from tensorflow_tts.models import TFFastSpeech2
 from tensorflow_tts.optimizers import AdamWeightDecay, WarmUp
 from tensorflow_tts.trainers import Seq2SeqBasedTrainer
-from tensorflow_tts.utils import (calculate_2d_loss, calculate_3d_loss,
-                                  return_strategy, TFGriffinLim)
+from tensorflow_tts.utils import (
+    calculate_2d_loss,
+    calculate_3d_loss,
+    return_strategy,
+    TFGriffinLim,
+)
 
 
 class FastSpeech2Trainer(Seq2SeqBasedTrainer):
     """FastSpeech2 Trainer class based on FastSpeechTrainer."""
 
     def __init__(
-        self, config, strategy, steps=0, epochs=0, is_mixed_precision=False, stats_path: str = "",
-            dataset_config: str = ""
+        self,
+        config,
+        strategy,
+        steps=0,
+        epochs=0,
+        is_mixed_precision=False,
+        stats_path: str = "",
+        dataset_config: str = "",
     ):
         """Initialize trainer.
         Args:
@@ -78,7 +89,9 @@ def __init__(
         self.use_griffin = config.get("use_griffin", False)
         self.griffin_lim_tf = None
         if self.use_griffin:
-            logging.info(f"Load griff stats from {stats_path} and config from {dataset_config}")
+            logging.info(
+                f"Load griff stats from {stats_path} and config from {dataset_config}"
+            )
             self.griff_conf = yaml.load(open(dataset_config), Loader=yaml.Loader)
             self.prepare_grim(stats_path, self.griff_conf)
 
@@ -160,7 +173,9 @@ def generate_and_save_intermediate_result(self, batch):
 
         # check directory
         if self.use_griffin:
-            griff_dir_name = os.path.join(self.config["outdir"], f"predictions/{self.steps}_wav")
+            griff_dir_name = os.path.join(
+                self.config["outdir"], f"predictions/{self.steps}_wav"
+            )
             if not os.path.exists(griff_dir_name):
                 os.makedirs(griff_dir_name)
 
@@ -171,23 +186,31 @@ def generate_and_save_intermediate_result(self, batch):
         for idx, (mel_gt, mel_before, mel_after) in enumerate(
             zip(mel_gts, mels_before, mels_after), 0
         ):
-            
-            
+
             if self.use_griffin:
                 utt_id = utt_ids[idx]
-                grif_before = self.griffin_lim_tf(tf.reshape(mel_before, [-1, 80])[tf.newaxis, :], n_iter=32)
-                grif_after = self.griffin_lim_tf(tf.reshape(mel_after, [-1, 80])[tf.newaxis, :], n_iter=32)
-                grif_gt = self.griffin_lim_tf(tf.reshape(mel_gt, [-1, 80])[tf.newaxis, :], n_iter=32)
-                self.griffin_lim_tf.save_wav(grif_before, griff_dir_name, f"{utt_id}_before")
-                self.griffin_lim_tf.save_wav(grif_after, griff_dir_name, f"{utt_id}_after")
+                grif_before = self.griffin_lim_tf(
+                    tf.reshape(mel_before, [-1, 80])[tf.newaxis, :], n_iter=32
+                )
+                grif_after = self.griffin_lim_tf(
+                    tf.reshape(mel_after, [-1, 80])[tf.newaxis, :], n_iter=32
+                )
+                grif_gt = self.griffin_lim_tf(
+                    tf.reshape(mel_gt, [-1, 80])[tf.newaxis, :], n_iter=32
+                )
+                self.griffin_lim_tf.save_wav(
+                    grif_before, griff_dir_name, f"{utt_id}_before"
+                )
+                self.griffin_lim_tf.save_wav(
+                    grif_after, griff_dir_name, f"{utt_id}_after"
+                )
                 self.griffin_lim_tf.save_wav(grif_gt, griff_dir_name, f"{utt_id}_gt")
-            
+
             utt_id = utt_ids[idx]
             mel_gt = tf.reshape(mel_gt, (-1, 80)).numpy()  # [length, 80]
             mel_before = tf.reshape(mel_before, (-1, 80)).numpy()  # [length, 80]
             mel_after = tf.reshape(mel_after, (-1, 80)).numpy()  # [length, 80]
 
-
             # plit figure and save it
             figname = os.path.join(dirname, f"{utt_id}.png")
             fig = plt.figure(figsize=(10, 8))
@@ -229,10 +252,7 @@ def main():
         "--use-norm", default=1, type=int, help="usr norm-mels for train or raw."
     )
     parser.add_argument(
-        "--f0-stat",
-        default="./dump/stats_f0.npy",
-        type=str,
-        help="f0-stat path.",
+        "--f0-stat", default="./dump/stats_f0.npy", type=str, help="f0-stat path.",
     )
     parser.add_argument(
         "--energy-stat",
@@ -266,26 +286,20 @@ def main():
         help="using mixed precision for generator or not.",
     )
     parser.add_argument(
-        "--dataset_config",
-        default="preprocess/libritts_preprocess.yaml",
-        type=str,
+        "--dataset_config", default="preprocess/libritts_preprocess.yaml", type=str,
     )
     parser.add_argument(
-        "--dataset_stats",
-        default="dump/stats.npy",
-        type=str,
+        "--dataset_stats", default="dump/stats.npy", type=str,
     )
     parser.add_argument(
-        "--dataset_mapping",
-        default="dump/libritts_mapper.npy",
-        type=str,
+        "--dataset_mapping", default="dump/libritts_mapper.npy", type=str,
     )
     parser.add_argument(
         "--pretrained",
         default="",
         type=str,
         nargs="?",
-        help='pretrained weights .h5 file to load weights from. Auto-skips non-matching layers',
+        help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
     )
     args = parser.parse_args()
 
@@ -362,7 +376,9 @@ def main():
 
     # Check n_speakers matches number of speakers in speakers_map
     n_speakers = config["fastspeech2_params"]["n_speakers"]
-    assert n_speakers == len(speakers_map), f"Number of speakers in dataset does not match n_speakers in config"
+    assert n_speakers == len(
+        speakers_map
+    ), f"Number of speakers in dataset does not match n_speakers in config"
 
     # define train/valid dataset
     train_dataset = CharactorDurationF0EnergyMelDataset(
@@ -375,11 +391,13 @@ def main():
         f0_stat=args.f0_stat,
         energy_stat=args.energy_stat,
         mel_length_threshold=mel_length_threshold,
-        speakers_map=speakers_map
+        speakers_map=speakers_map,
     ).create(
         is_shuffle=config["is_shuffle"],
         allow_cache=config["allow_cache"],
-        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
+        batch_size=config["batch_size"]
+        * STRATEGY.num_replicas_in_sync
+        * config["gradient_accumulation_steps"],
     )
 
     valid_dataset = CharactorDurationF0EnergyMelDataset(
@@ -392,7 +410,7 @@ def main():
         f0_stat=args.f0_stat,
         energy_stat=args.energy_stat,
         mel_length_threshold=mel_length_threshold,
-        speakers_map=speakers_map
+        speakers_map=speakers_map,
     ).create(
         is_shuffle=config["is_shuffle"],
         allow_cache=config["allow_cache"],
@@ -407,7 +425,7 @@ def main():
         epochs=0,
         is_mixed_precision=args.mixed_precision,
         stats_path=args.dataset_stats,
-        dataset_config=args.dataset_config
+        dataset_config=args.dataset_config,
     )
 
     with STRATEGY.scope():
@@ -417,11 +435,12 @@ def main():
         )
         fastspeech._build()
         fastspeech.summary()
-        
+
         if len(args.pretrained) > 1:
             fastspeech.load_weights(args.pretrained, by_name=True, skip_mismatch=True)
-            logging.info(f"Successfully loaded pretrained weight from {args.pretrained}.")
-
+            logging.info(
+                f"Successfully loaded pretrained weight from {args.pretrained}."
+            )
 
         # AdamW for fastspeech
         learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
 
@@ -36,10 +36,8 @@
 from examples.melgan.audio_mel_dataset import AudioMelDataset
 from examples.melgan.train_melgan import MelganTrainer, collater
 from tensorflow_tts.losses import TFMultiResolutionSTFT
-from tensorflow_tts.models import (TFMelGANGenerator,
-                                   TFMelGANMultiScaleDiscriminator)
-from tensorflow_tts.utils import (calculate_2d_loss, calculate_3d_loss,
-                                  return_strategy)
+from tensorflow_tts.models import TFMelGANGenerator, TFMelGANMultiScaleDiscriminator
+from tensorflow_tts.utils import calculate_2d_loss, calculate_3d_loss, return_strategy
 
 
 class MultiSTFTMelganTrainer(MelganTrainer):
@@ -206,7 +204,7 @@ def main():
         default="",
         type=str,
         nargs="?",
-        help='path of .h5 melgan generator to load weights from',
+        help="path of .h5 melgan generator to load weights from",
     )
     args = parser.parse_args()
 
@@ -295,7 +293,9 @@ def main():
             hop_size=tf.constant(config["hop_size"], dtype=tf.int32),
         ),
         allow_cache=config["allow_cache"],
-        batch_size=config["batch_size"] * STRATEGY.num_replicas_in_sync,
+        batch_size=config["batch_size"]
+        * STRATEGY.num_replicas_in_sync
+        * config["gradient_accumulation_steps"],
     )
 
     valid_dataset = AudioMelDataset(
@@ -336,19 +336,22 @@ def main():
         )
 
         discriminator = TFMelGANMultiScaleDiscriminator(
-            MELGAN_CONFIG.MelGANDiscriminatorConfig(**config["melgan_discriminator_params"]),
+            MELGAN_CONFIG.MelGANDiscriminatorConfig(
+                **config["melgan_discriminator_params"]
+            ),
             name="melgan_discriminator",
         )
 
         # dummy input to build model.
         fake_mels = tf.random.uniform(shape=[1, 100, 80], dtype=tf.float32)
         y_hat = generator(fake_mels)
         discriminator(y_hat)
-        
+
         if len(args.pretrained) > 1:
             generator.load_weights(args.pretrained)
-            logging.info(f"Successfully loaded pretrained weight from {args.pretrained}.")
-
+            logging.info(
+                f"Successfully loaded pretrained weight from {args.pretrained}."
+            )
 
         generator.summary()
         discriminator.summary()