Adding neural HMM TTS #2271

Closed
wants to merge 83 commits
Changes from 1 commit
Commits
83 commits
405bffe
Adding encoder
shivammehta25 Nov 26, 2022
d607993
currently modifying hmm
shivammehta25 Nov 27, 2022
a324920
Adding hmm
shivammehta25 Nov 28, 2022
8628648
Adding overflow
shivammehta25 Nov 30, 2022
6ec83c4
Adding overflow setting up flat start
shivammehta25 Dec 1, 2022
783a982
Removing runs
shivammehta25 Dec 1, 2022
10f15e0
adding normalization parameters
shivammehta25 Dec 1, 2022
aff8b1f
Fixing models on same device
shivammehta25 Dec 1, 2022
62941d6
Training overflow and plotting evaluations
shivammehta25 Dec 2, 2022
f448ea4
Adding inference
shivammehta25 Dec 3, 2022
ff33837
At the end of epoch the test sentences are coming on cpu instead of gpu
shivammehta25 Dec 4, 2022
3edb0d2
Adding figures from model during training to monitor
shivammehta25 Dec 5, 2022
5fc800c
reverting tacotron2 training recipe
shivammehta25 Dec 5, 2022
427dfe5
fixing inference on gpu for test sentences on config
shivammehta25 Dec 5, 2022
ecc12c6
moving helpers and texts within overflows source code
shivammehta25 Dec 5, 2022
b86f3f8
renaming to overflow
shivammehta25 Dec 5, 2022
995ee93
moving loss to the model file
shivammehta25 Dec 5, 2022
5b0fe46
Fixing the rename
shivammehta25 Dec 5, 2022
5377f87
Model training but not plotting the test config sentences's audios
shivammehta25 Dec 5, 2022
bd5be6c
Formatting logs
shivammehta25 Dec 5, 2022
755aa6f
Changing model name to camelcase
shivammehta25 Dec 5, 2022
1350a4b
Fixing test log
shivammehta25 Dec 5, 2022
3c986fd
Fixing plotting bug
shivammehta25 Dec 6, 2022
4a5b1a0
Adding some tests
shivammehta25 Dec 6, 2022
5b1dabc
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Dec 7, 2022
f43d7e3
Adding more tests to overflow
shivammehta25 Dec 8, 2022
c3d0167
Adding all tests for overflow
shivammehta25 Dec 9, 2022
ddefe34
making changes to camel case in config
shivammehta25 Dec 9, 2022
c2df9f3
Adding information about parameters and docstring
shivammehta25 Dec 10, 2022
9927434
removing compute_mel_statistics moved statistic computation to the mo…
shivammehta25 Dec 10, 2022
340cd0b
Added overflow in readme
shivammehta25 Dec 10, 2022
aca3fe1
Adding more test cases, now it doesn't saves transition_p like tensor…
shivammehta25 Dec 11, 2022
e7c11dd
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Dec 14, 2022
7e2dbb1
uncommenting the approximation to stablize the training
shivammehta25 Dec 14, 2022
be09d6c
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Dec 14, 2022
282de93
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Dec 22, 2022
5df4fe8
Adding encoder
shivammehta25 Nov 26, 2022
fa25825
currently modifying hmm
shivammehta25 Nov 27, 2022
3cb0f78
Adding hmm
shivammehta25 Nov 28, 2022
9984afa
Adding overflow
shivammehta25 Nov 30, 2022
4dad45c
Adding overflow setting up flat start
shivammehta25 Dec 1, 2022
377bd3e
Removing runs
shivammehta25 Dec 1, 2022
a441c71
adding normalization parameters
shivammehta25 Dec 1, 2022
995ac14
Fixing models on same device
shivammehta25 Dec 1, 2022
97b985b
Training overflow and plotting evaluations
shivammehta25 Dec 2, 2022
227077a
Adding inference
shivammehta25 Dec 3, 2022
bea46cc
At the end of epoch the test sentences are coming on cpu instead of gpu
shivammehta25 Dec 4, 2022
03d028e
Adding figures from model during training to monitor
shivammehta25 Dec 5, 2022
fc3c641
reverting tacotron2 training recipe
shivammehta25 Dec 5, 2022
c429837
fixing inference on gpu for test sentences on config
shivammehta25 Dec 5, 2022
b804a12
moving helpers and texts within overflows source code
shivammehta25 Dec 5, 2022
3149b43
renaming to overflow
shivammehta25 Dec 5, 2022
8aff87a
moving loss to the model file
shivammehta25 Dec 5, 2022
8d7b0e7
Fixing the rename
shivammehta25 Dec 5, 2022
8aaffed
Model training but not plotting the test config sentences's audios
shivammehta25 Dec 5, 2022
648b2c3
Formatting logs
shivammehta25 Dec 5, 2022
d22c6c0
Changing model name to camelcase
shivammehta25 Dec 5, 2022
6e08e4f
Fixing test log
shivammehta25 Dec 5, 2022
9394ce0
Fixing plotting bug
shivammehta25 Dec 6, 2022
e115361
Adding some tests
shivammehta25 Dec 6, 2022
7a541b9
Adding more tests to overflow
shivammehta25 Dec 8, 2022
1dccc29
Adding all tests for overflow
shivammehta25 Dec 9, 2022
1b1bf1f
making changes to camel case in config
shivammehta25 Dec 9, 2022
916b98e
Adding information about parameters and docstring
shivammehta25 Dec 10, 2022
6eff37c
removing compute_mel_statistics moved statistic computation to the mo…
shivammehta25 Dec 10, 2022
8a8dd1d
Added overflow in readme
shivammehta25 Dec 10, 2022
e738c0c
Adding more test cases, now it doesn't saves transition_p like tensor…
shivammehta25 Dec 11, 2022
479c0cf
Handle espeak 1.48.15 (#2203)
erogol Dec 12, 2022
4f02e2c
Python API implementation (#2195)
erogol Dec 12, 2022
89b9868
Update README (#2204)
erogol Dec 12, 2022
684adb0
Adding missing key to formatter (#2194)
p0p4k Dec 12, 2022
55801cc
Add YourTTS VCTK recipe (#2198)
Edresson Dec 12, 2022
a0be902
Add Original YourTTS vocabulary for full transfer learning (#2206)
Edresson Dec 13, 2022
f3fe409
uncommenting the approximation to stablize the training
shivammehta25 Dec 14, 2022
aedd795
Adding pre-trained Overflow model (#2211)
erogol Dec 14, 2022
253b03f
Fixup overflow (#2218)
erogol Dec 14, 2022
c2ce4fb
Bump up to v0.10.0
erogol Dec 15, 2022
fd5ad8c
Add Ukrainian LADA (female) voice
egorsmkv Dec 16, 2022
1260c7f
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Dec 30, 2022
f73cd29
Merge branch 'coqui-ai:dev' into dev
shivammehta25 Jan 3, 2023
2abbc97
Merge branch 'dev' of github.com:shivammehta25/TTS into dev
shivammehta25 Jan 5, 2023
790b846
Adding a config flag to train neural HMM TTS instead of overflow
shivammehta25 Jan 9, 2023
a8d0b22
Backwards compatibility: Fixing model zoo if the flag is not set, set it
shivammehta25 Jan 9, 2023
Adding overflow
shivammehta25 committed Dec 23, 2022
commit 9984afae653c28e1d76bf2183e901e898786895b
24 changes: 15 additions & 9 deletions TTS/tts/layers/neural_hmm/common_layers.py
@@ -22,12 +22,18 @@ class Encoder(nn.Module):
         - output: (B, C_in, T)
     """

-    def __init__(self, state_per_phone, in_out_channels=512):
+    def __init__(
+        self,
+        num_chars,
+        state_per_phone,
+        in_out_channels=512,
+    ):
         super().__init__()

         self.state_per_phone = state_per_phone
         self.in_out_channels = in_out_channels

+        self.emb = nn.Embedding(num_chars, in_out_channels)
         self.convolutions = nn.ModuleList()
         for _ in range(3):
             self.convolutions.append(ConvBNBlock(in_out_channels, in_out_channels, 5, "relu"))
@@ -42,8 +48,8 @@ def __init__(self, state_per_phone, in_out_channels=512):
         self.rnn_state = None

     def forward(self, x, input_lengths):
-        b, _, T = x.shape
-        o = x
+        b, T = x.shape
+        o = self.emb(x).transpose(1, 2)
         for layer in self.convolutions:
             o = layer(o)
         o = o.transpose(1, 2)
@@ -73,7 +79,7 @@ class ParameterModel(nn.Module):

     def __init__(
         self,
-        parameternetwork: List[int],
+        outputnet_size: List[int],
         input_size: int,
         output_size: int,
         flat_start_params: dict,
@@ -83,9 +89,9 @@ def __init__(
         self.flat_start_params = flat_start_params

         self.layers = nn.ModuleList(
-            [Linear(inp, out) for inp, out in zip([input_size] + parameternetwork[:-1], parameternetwork)]
+            [Linear(inp, out) for inp, out in zip([input_size] + outputnet_size[:-1], outputnet_size)]
         )
-        last_layer = self._flat_start_output_layer(parameternetwork[-1], output_size, frame_channels)
+        last_layer = self._flat_start_output_layer(outputnet_size[-1], output_size, frame_channels)
         self.layers.append(last_layer)

     def _flat_start_output_layer(self, input_size, output_size, frame_channels):
@@ -115,7 +121,7 @@ def __init__(
         encoder_dim: int,
         memory_rnn_dim: int,
         frame_channels: int,
-        parameternetwork: List[int],
+        outputnet_size: List[int],
         flat_start_params: dict,
         std_floor: float = 1e-2,
     ):
@@ -131,7 +137,7 @@ def __init__(
         self._validate_parameters()

         self.parametermodel = ParameterModel(
-            parameternetwork=parameternetwork,
+            outputnet_size=outputnet_size,
            input_size=input_size,
            output_size=output_size,
            flat_start_params=flat_start_params,
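With this change the encoder consumes integer token IDs directly rather than pre-embedded features. A minimal standalone sketch of the new input path (the vocabulary size and shapes below are illustrative assumptions, not values from this PR):

    import torch
    import torch.nn as nn

    num_chars, in_out_channels = 100, 512       # illustrative sizes
    emb = nn.Embedding(num_chars, in_out_channels)

    x = torch.randint(0, num_chars, (8, 50))    # [B, T] token IDs, as in Encoder.forward
    o = emb(x).transpose(1, 2)                  # [B, C, T], ready for the conv stack
    print(o.shape)                              # torch.Size([8, 512, 50])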
69 changes: 69 additions & 0 deletions TTS/tts/layers/neural_hmm/decoder.py
@@ -0,0 +1,69 @@
import torch

from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
from TTS.tts.utils.helpers import sequence_mask


class Decoder(GlowDecoder):
    """Uses the glow decoder with some modifications.
    ::

        Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze

    Args:
        in_channels (int): channels of the input tensor.
        hidden_channels (int): hidden decoder channels.
        kernel_size (int): coupling block kernel size (WaveNet filter kernel size).
        dilation_rate (int): rate to increase dilation by in each layer of a decoder block.
        num_flow_blocks (int): number of decoder blocks.
        num_coupling_layers (int): number of coupling layers (number of WaveNet layers).
        dropout_p (float): WaveNet dropout rate.
        sigmoid_scale (bool): enable/disable sigmoid scaling in the coupling layer.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        num_flow_blocks,
        num_coupling_layers,
        dropout_p=0.0,
        num_splits=4,
        num_squeeze=2,
        sigmoid_scale=False,
        c_in_channels=0,
    ):
        super().__init__(
            in_channels,
            hidden_channels,
            kernel_size,
            dilation_rate,
            num_flow_blocks,
            num_coupling_layers,
            dropout_p,
            num_splits,
            num_squeeze,
            sigmoid_scale,
            c_in_channels,
        )

    def forward(self, x, x_len, g=None, reverse=False):
        """
        Shapes:
            - x: :math:`[B, C, T]`
            - x_len: :math:`[B]`
            - g: :math:`[B, C]`
        """
        x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
        x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
        x, logdet_tot = super().forward(x, x_mask, g, reverse)
        return x, x_len, logdet_tot

    def preprocess(self, y, y_lengths, y_max_length):
        if y_max_length is not None:
            y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
            y = y[:, :, :y_max_length]
        y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
        return y, y_lengths, y_max_length
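`preprocess` rounds mel lengths down to a multiple of the squeeze factor `self.n_sqz` so that the squeeze step divides evenly. A quick standalone check of that arithmetic (assuming a squeeze factor of 2 for illustration):

    import torch

    n_sqz = 2                                   # assumed squeeze factor
    y_lengths = torch.tensor([103, 80, 77])
    trimmed = torch.div(y_lengths, n_sqz, rounding_mode="floor") * n_sqz
    print(trimmed)                              # tensor([102,  80,  76])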
6 changes: 4 additions & 2 deletions TTS/tts/layers/neural_hmm/hmm.py
@@ -39,7 +39,7 @@ def __init__(
         prenet_dropout: float,
         memory_rnn_dim: int,
         prenet_dropout_at_inference: bool,
-        parameternetwork: List[int],
+        outputnet_size: List[int],
         flat_start_params: dict,
         std_floor: float,
     ):
@@ -64,7 +64,9 @@ def __init__(
             bias=False,
         )
         self.memory_rnn = nn.LSTMCell(input_size=prenet_dim, hidden_size=memory_rnn_dim)
-        self.output_net = Outputnet(encoder_dim, memory_rnn_dim, frame_channels, parameternetwork, flat_start_params, std_floor)
+        self.output_net = Outputnet(
+            encoder_dim, memory_rnn_dim, frame_channels, outputnet_size, flat_start_params, std_floor
+        )
         self.register_buffer("go_tokens", torch.zeros(ar_order, 1))

     def forward(self, inputs, inputs_len, mels, mel_lens):
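The renamed `outputnet_size` argument lists the hidden widths of the MLP that maps the RNN state to HMM emission parameters; `ParameterModel` chains them with the `zip` pattern shown earlier. A standalone sketch of that construction (sizes are illustrative, not the PR's defaults):

    import torch.nn as nn

    input_size, outputnet_size = 1024, [256, 256]   # illustrative sizes
    layers = [
        nn.Linear(inp, out)
        for inp, out in zip([input_size] + outputnet_size[:-1], outputnet_size)
    ]
    print([(l.in_features, l.out_features) for l in layers])
    # [(1024, 256), (256, 256)]; a final layer then maps 256 -> output_size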
106 changes: 106 additions & 0 deletions TTS/tts/models/overflow.py
@@ -0,0 +1,106 @@
import torch
import torch.nn as nn

from TTS.tts.layers.neural_hmm.common_layers import Encoder
from TTS.tts.layers.neural_hmm.decoder import Decoder
from TTS.tts.layers.neural_hmm.hmm import HMM
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer


class OverFlow(BaseTTS):
    """OverFlow TTS model.

    Paper::
        https://arxiv.org/abs/2211.06892

    Paper abstract::
        Neural HMMs are a type of neural transducer recently proposed for
        sequence-to-sequence modelling in text-to-speech. They combine the best features
        of classic statistical speech synthesis and modern neural TTS, requiring less
        data and fewer training updates, and are less prone to gibberish output caused
        by neural attention failures. In this paper, we combine neural HMM TTS with
        normalising flows for describing the highly non-Gaussian distribution of speech
        acoustics. The result is a powerful, fully probabilistic model of durations and
        acoustics that can be trained using exact maximum likelihood. Compared to
        dominant flow-based acoustic models, our approach integrates autoregression for
        improved modelling of long-range dependences such as utterance-level prosody.
        Experiments show that a system based on our proposal gives more accurate
        pronunciations and better subjective speech quality than comparable methods,
        whilst retaining the original advantages of neural HMMs. Audio examples and code
        are available at https://shivammehta25.github.io/OverFlow/.

    Check :class:`TTS.tts.configs.overflow.OverFlowConfig` for class arguments.
    """

    def __init__(
        self,
        config: "OverFlowConfig",
        ap: "AudioProcessor" = None,
        tokenizer: "TTSTokenizer" = None,
        speaker_manager: SpeakerManager = None,
    ):
        super().__init__(config, ap, tokenizer, speaker_manager)

        # Pass all config fields to `self` to keep code changes small.
        self.config = config
        for key in config:
            setattr(self, key, config[key])

        self.decoder_output_dim = config.out_channels

        self.encoder = Encoder(config.num_chars, config.state_per_phone, config.encoder_in_features)
        self.hmm = HMM(
            self.out_channels,
            self.ar_order,
            self.encoder_dim,
            self.prenet_type,
            self.prenet_dim,
            self.prenet_dropout,
            self.memory_rnn_dim,
            self.prenet_dropout_at_inference,
            self.outputnet_size,
            self.flat_start_params,
            self.std_floor,
        )
        self.decoder = Decoder(
            self.out_channels,
            self.hidden_channels_dec,
            self.kernel_size_dec,
            self.dilation_rate,
            self.num_flow_blocks_dec,
            self.num_block_layers,
            dropout_p=self.dropout_p_dec,
            num_splits=self.num_splits,
            num_squeeze=self.num_squeeze,
            sigmoid_scale=self.sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )

    def forward(self, text, text_len, mels, mel_len):
        """
        Forward pass for training, computing the log-likelihood of the given batch.

        Shapes:
            text: :math:`[B, T_in]`
            text_len: :math:`[B]`
            mels: :math:`[B, T_out, C]`
            mel_len: :math:`[B]`
        """
        encoder_outputs, text_len = self.encoder(text, text_len)
        # The flow decoder expects [B, C, T]; lengths may be trimmed to a multiple of num_squeeze.
        z, z_lengths, log_det = self.decoder(mels.transpose(1, 2), mel_len)
        log_probs = self.hmm(encoder_outputs, text_len, z, z_lengths)

        outputs = {"log_probs": log_probs + log_det}  # the flow log-determinant enters the exact likelihood
        return outputs
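For orientation, a shape-level sketch of driving this forward pass with dummy tensors (batch size, vocabulary size, and 80 mel channels are illustrative assumptions; `model` stands for a constructed OverFlow instance):

    import torch

    B, T_in, T_out, C = 8, 50, 120, 80
    text = torch.randint(0, 100, (B, T_in))      # [B, T_in] token IDs
    text_len = torch.full((B,), T_in)            # [B]
    mels = torch.randn(B, T_out, C)              # [B, T_out, C]
    mel_len = torch.full((B,), T_out)            # [B]

    # outputs = model(text, text_len, mels, mel_len)
    # loss = -outputs["log_probs"].mean()        # train with exact maximum likelihood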
1 change: 0 additions & 1 deletion TTS/tts/utils/helpers.py
@@ -282,6 +282,5 @@ def logsumexp(x, dim):

     m, _ = x.max(dim=dim)
     mask = m == -float("inf")
-
     s = (x - m.masked_fill_(mask, 0).unsqueeze(dim=dim)).exp().sum(dim=dim)
     return s.masked_fill_(mask, 1).log() + m.masked_fill_(mask, -float("inf"))
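For reference, this helper implements the standard max-shift identity for a numerically stable log-sum-exp, with the mask keeping an all-negative-infinity input at negative infinity instead of producing NaN:

    \log \sum_i e^{x_i} = m + \log \sum_i e^{x_i - m}, \qquad m = \max_i x_i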
19 changes: 18 additions & 1 deletion tests/tts_tests/test_helpers.py
@@ -1,6 +1,13 @@
 import torch as T

-from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask
+from TTS.tts.utils.helpers import (
+    average_over_durations,
+    generate_path,
+    logsumexp,
+    rand_segments,
+    segment,
+    sequence_mask,
+)


 def average_over_durations_test():  # pylint: disable=no-self-use
@@ -86,3 +93,13 @@ def generate_path_test():
             assert all(path[b, t, :current_idx] == 0.0)
             assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0)
             current_idx += durations[b, t].item()
+
+
+def logsumexp_test():
+    a = T.randn(10)  # random numbers
+    assert T.eq(T.logsumexp(a, dim=0), logsumexp(a, dim=0)).all()
+
+    a = T.zeros(10)  # all zeros
+    assert T.eq(T.logsumexp(a, dim=0), logsumexp(a, dim=0)).all()
+
+    a = T.ones(10)  # all ones
+    assert T.eq(T.logsumexp(a, dim=0), logsumexp(a, dim=0)).all()