Adding information about parameters and docstring

coqui-ai · Dec 10, 2022 · c2df9f3 · c2df9f3
1 parent ddefe34
commit c2df9f3
Show file tree

Hide file tree

Showing 3 changed files with 128 additions and 7 deletions.
diff --git a/TTS/tts/configs/overflow_config.py b/TTS/tts/configs/overflow_config.py
@@ -9,11 +9,107 @@ class OverflowConfig(BaseTTSConfig):  # The classname has to be camel case
     """
     Define parameters for OverFlow model.
 
+    Example:
+
+        >>> from TTS.tts.configs.overflow_config import OverflowConfig
+        >>> config = OverflowConfig()
+
     Args:
-        BaseTTSConfig (_type_): _description_
+        model (str):
+            Model name used to select the right model class to initialize. Defaults to `Overflow`.
+        run_eval_steps (int):
+            Run evaluation epoch after N steps. If None, waits until the training epoch is completed. Defaults to 100.
+        save_step (int):
+            Save local checkpoint every save_step steps. Defaults to 500.
+        plot_step (int):
+            Plot training stats on the logger every plot_step steps. Defaults to 1.
+        model_param_stats (bool):
+            Log model parameters stats on the logger dashboard. Defaults to False.
+        force_generate_statistics (bool):
+            Force generate mel normalization statistics. Defaults to False.
+        mel_statistics_parameter_path (str):
+            Path to the mel normalization statistics. If the model doesn't find a file there, it will generate statistics.
+            Defaults to None.
+        num_chars (int):
+            Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
+        state_per_phone (int):
+            Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but in Overflow it is upsampled by the model's encoder. Defaults to 2.
+        encoder_in_out_features (int):
+            Channels of encoder input and character embedding tensors. Defaults to 512.
+        encoder_n_convolutions (int):
+            Number of convolution layers in the encoder. Defaults to 3.
+        out_channels (int):
+            Channels of the final model output. It must match the spectrogram size. Defaults to 80.
+        ar_order (int):
+            Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that more autoregression, while giving more variation, hurts the naturalness of the synthesised audio.
+        sampling_temp (float):
+            Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
+        deterministic_transition (bool):
+            Deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.
+        duration_threshold (float):
+            Threshold for duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values define a slower speaking rate and higher values define a faster speaking rate.
+        use_grad_checkpointing (bool):
+            Use gradient checkpointing to save memory. In a multi-GPU setting, PyTorch currently does not support gradient checkpointing inside a loop, so it has to be turned off in that case. Adjust depending on whichever gives a larger batch size: using a single GPU or multi-GPU. Defaults to True.
+        max_sampling_time (int):
+            Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
+        prenet_type (str):
+            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+            Prenet. Defaults to `original`.
+        prenet_dim (int):
+            Dimension of the Prenet. Defaults to 256.
+        prenet_n_layers (int):
+            Number of layers in the Prenet. Defaults to 2.
+        prenet_dropout (float):
+            Dropout rate of the Prenet. Defaults to 0.5.
+        prenet_dropout_at_inference (bool):
+            Use dropout at inference time. Defaults to False.
+        memory_rnn_dim (int):
+            Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
+        outputnet_size (list[int]):
+            Size of the output network inside the neural HMM. Defaults to [1024].
+        flat_start_params (dict):
+            Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
+            It will be recomputed when you pass the dataset.
+        std_floor (float):
+            Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
+            It is called `variance flooring` in standard HMM literature.
+        hidden_channels_dec (int):
+            Number of base hidden channels used by the decoder WaveNet network. Defaults to 150.
+        kernel_size_dec (int):
+            Decoder kernel size. Defaults to 5.
+        dilation_rate (int):
+            Rate to increase dilation by each layer in a decoder block. Defaults to 1.
+        num_flow_blocks_dec (int):
+            Number of decoder layers in each decoder block.  Defaults to 4.
+        dropout_p_dec (float):
+            Dropout rate of the decoder. Defaults to 0.05.
+        num_splits (int):
+            Number of split levels in the invertible conv1x1 operation. Defaults to 4.
+        num_squeeze (int):
+            Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps
+            decreases by the factor `num_squeeze`. Defaults to 2.
+        sigmoid_scale (bool):
+            Enable/disable sigmoid scaling in the decoder. Defaults to False.
+        c_in_channels (int):
+            Unused parameter from GlowTTS's decoder. Defaults to 0.
+        optimizer (str):
+            Optimizer to use for training. Defaults to `adam`.
+        optimizer_params (dict):
+            Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
+        grad_clip (float):
+            Gradient clipping threshold. Defaults to 40_000.
+        lr (float):
+            Learning rate. Defaults to 1e-3.
+        lr_scheduler (str):
+            Learning rate scheduler for the training. Use one from the `torch.optim.lr_scheduler` schedulers or
+            `TTS.utils.training`. Defaults to `None`.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
     """
 
-    model: str = "OverFlow"
+    model: str = "Overflow"
 
     # Training and Checkpoint configs
     run_eval_steps: int = 100
@@ -49,7 +145,7 @@ class OverflowConfig(BaseTTSConfig):  # The classname has to be camel case
     memory_rnn_dim: int = 1024
 
     ## Outputnet parameters
-    outputnet_size: List[int] = field(default_factory=lambda: [256, 256])
+    outputnet_size: List[int] = field(default_factory=lambda: [1024])
     flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
     std_floor: float = 0.01
 
@@ -79,11 +175,7 @@ class OverflowConfig(BaseTTSConfig):  # The classname has to be camel case
     # testing
     test_sentences: List[str] = field(
         default_factory=lambda: [
-            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
             "Be a voice, not an echo.",
-            "I'm sorry Dave. I'm afraid I can't do that.",
-            "This cake is great. It's so delicious and moist.",
-            "Prior to November 22, 1963.",
         ]
     )
 

diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py
@@ -27,6 +27,22 @@ class NeuralHMM(nn.Module):
         approximation. We also describe how to combine ideas from classical and contemporary TTS for best results. The resulting
         example system is smaller and simpler than Tacotron 2, and learns to speak with fewer iterations and less data, whilst
         achieving comparable naturalness prior to the post-net. Our approach also allows easy control over speaking rate.
+
+    Args:
+        frame_channels (int): Output dimension to generate.
+        ar_order (int): Autoregressive order of the model. In ablations of Neural HMM it was found that more autoregression, while giving more variation, hurts the naturalness of the synthesised audio.
+        deterministic_transition (bool): Deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Median-based generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.
+        encoder_dim (int): Channels of encoder input and character embedding tensors. Defaults to 512.
+        prenet_type (str): `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the Prenet.
+        prenet_dim (int): Dimension of the Prenet.
+        prenet_n_layers (int): Number of layers in the Prenet.
+        prenet_dropout (float): Dropout probability of the Prenet.
+        prenet_dropout_at_inference (bool): If True, dropout is applied at inference time.
+        memory_rnn_dim (int): Size of the memory RNN to process output of prenet.
+        outputnet_size (List[int]): Size of the output network inside the neural HMM.
+        flat_start_params (dict): Parameters for the flat start initialization of the neural HMM.
+        std_floor (float): Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint.
+        use_grad_checkpointing (bool, optional): Use gradient checkpointing to save memory. Defaults to True.
     """
 
     def __init__(

diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py
@@ -43,6 +43,19 @@ class Overflow(BaseTTS):
     whilst retaining the original advantages of neural HMMs. Audio examples and code
     are available at https://shivammehta25.github.io/OverFlow/.
 
+    Note:
+        - Neural HMMs use flat start initialization, i.e., the means, standard deviations and transition probabilities
+        of the dataset are computed and used to initialize the model. This benefits the model and helps with faster learning.
+        If you change the dataset or want to regenerate the parameters, change the `force_generate_statistics` and
+        `mel_statistics_parameter_path` accordingly.
+
+        - To enable multi-GPU training, set `use_grad_checkpointing=False` in the config.
+        This will significantly increase the memory usage. This is because to compute
+        the actual data likelihood (not an approximation using MAS/Viterbi) we must use
+        all the states at the previous time step during the forward pass to decide the
+        probability distribution at the current step, i.e., the difference between the forward
+        algorithm and the Viterbi approximation.
+
     Check :class:`TTS.tts.configs.overflow_config.OverflowConfig` for class arguments.
     """