Add Delightful-TTS implementation (coqui-ai#2095)

* add configs * Update config file * Add model configs * Add model layers * Add layer files * Add layer modules * change config names * Add emotion manager * fIX missing ap bug * Fix missing ap bug * Add base TTS e2e class * Fix wrong variable name in load_tts_samples * Add training script * Remove range predictor and gaussian upsampling * Add helper function * Add vctk recipe * Add conformer docs * Fix linting in conformer.py * Add Docs * remove duplicate import * refactor args * Fix bugs * Removew emotion embedding * remove unused arg * Remove emotion embedding arg * Remove emotion embedding arg * fix style issues * Fix bugs * Fix bugs * Add unittests * make style * fix formatter bug * fix test * Add pyworld compute pitch func * Update requirments.txt * Fix dataset Bug * Chnge layer norm to instance norm * Add missing import * Remove emotions.py * remove ssim loss * Add init layers func to aligner * refactor model layers * remove audio_config arg * Rename loss func * Rename to delightful-tts * Rename loss func * Remove unused modules * refactor imports * replace audio config with audio processor * Add change sample rate option * remove broken resample func * update recipe * fix style, add config docs * fix tests and multispeaker embd dim * remove pyworld * Make style and fix inference * Split tts tests * Fixup * Fixup * Fixup * Add argument names * Set "random" speaker in the model Tortoise/Bark * Use a diff f0_cache path for delightfull tts * Fix delightful speaker handling * Fix lint * Make style --------- Co-authored-by: loganhart420 <loganartpersonal@gmail.com> Co-authored-by: Eren Gölge <erogol@hotmail.com>
isikhi · Jul 24, 2023 · 6fdb88f · 6fdb88f
1 parent f24c5e0
commit 6fdb88f
Show file tree

Hide file tree

Showing 41 changed files with 5,202 additions and 5 deletions.
diff --git a/.github/workflows/tts_tests2.yml b/.github/workflows/tts_tests2.yml
@@ -0,0 +1,53 @@
+name: tts-tests2
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  check_skip:  # NOTE(review): `test` below has no `needs: check_skip`, so this gate does not actually skip the tests
+    runs-on: ubuntu-latest
+    if: "! contains(github.event.head_commit.message, '[ci skip]')"
+    steps:
+      - run: echo "${{ github.event.head_commit.message }}"  # NOTE(review): message is expanded into the shell command; consider passing it through `env:`
+
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9, "3.10", "3.11"]
+        experimental: [false]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: 'requirements*'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: set ENV  # written to GITHUB_ENV because a plain `export` is lost when this step's shell exits
+        run: echo "TRAINER_TELEMETRY=0" >> "$GITHUB_ENV"
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          sudo apt-get install -y espeak
+          sudo apt-get install -y espeak-ng
+          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
+      - name: Install TTS
+        run: |
+          python3 -m pip install .[all]
+          python3 setup.py egg_info
+      - name: Unit tests
+        run: make test_tts2
diff --git a/Makefile b/Makefile
@@ -19,6 +19,9 @@ test_vocoder:	## run vocoder tests.
 test_tts:	## run tts tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
 
+test_tts2:	## run the second batch of tts tests (tests.tts_tests2).
+	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
+
 test_aux:	## run aux tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
 	./run_bash_tests.sh

diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
@@ -430,9 +430,9 @@ def main():
     if tts_path is not None:
         wav = synthesizer.tts(
             args.text,
-            args.speaker_idx,
-            args.language_idx,
-            args.speaker_wav,
+            speaker_name=args.speaker_idx,
+            language_name=args.language_idx,
+            speaker_wav=args.speaker_wav,
             reference_wav=args.reference_wav,
             style_wav=args.capacitron_style_wav,
             style_text=args.capacitron_style_text,

diff --git a/TTS/tts/configs/delightful_tts_config.py b/TTS/tts/configs/delightful_tts_config.py
@@ -0,0 +1,170 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.delightful_tts import DelightfulTtsArgs, DelightfulTtsAudioConfig, VocoderConfig
+
+
+@dataclass
+class DelightfulTTSConfig(BaseTTSConfig):
+    """
+    Configuration class for the DelightfulTTS model.
+
+    Attributes:
+        model (str): Name of the model ("delightful_tts").
+        audio (DelightfulTtsAudioConfig): Configuration for audio settings.
+        model_args (DelightfulTtsArgs): Configuration for model arguments.
+        use_attn_priors (bool): Whether to use attention priors.
+        vocoder (VocoderConfig): Configuration for the vocoder.
+        init_discriminator (bool): Whether to initialize the discriminator.
+        steps_to_start_discriminator (int): Number of steps to start the discriminator.
+        grad_clip (List[float]): Gradient clipping values.
+        lr_gen (float): Learning rate for the GAN generator.
+        lr_disc (float): Learning rate for the GAN discriminator.
+        lr_scheduler_gen (str): Name of the learning rate scheduler for the generator.
+        lr_scheduler_gen_params (dict): Parameters for the learning rate scheduler for the generator.
+        lr_scheduler_disc (str): Name of the learning rate scheduler for the discriminator.
+        lr_scheduler_disc_params (dict): Parameters for the learning rate scheduler for the discriminator.
+        scheduler_after_epoch (bool): Whether to schedule after each epoch.
+        optimizer (str): Name of the optimizer.
+        optimizer_params (dict): Parameters for the optimizer.
+        ssim_loss_alpha (float): Alpha value for the SSIM loss.
+        mel_loss_alpha (float): Alpha value for the mel loss.
+        aligner_loss_alpha (float): Alpha value for the aligner loss.
+        pitch_loss_alpha (float): Alpha value for the pitch loss.
+        energy_loss_alpha (float): Alpha value for the energy loss.
+        u_prosody_loss_alpha (float): Alpha value for the utterance prosody loss.
+        p_prosody_loss_alpha (float): Alpha value for the phoneme prosody loss.
+        dur_loss_alpha (float): Alpha value for the duration loss.
+        char_dur_loss_alpha (float): Alpha value for the character duration loss.
+        binary_align_loss_alpha (float): Alpha value for the binary alignment loss.
+        binary_loss_warmup_epochs (int): Number of warm-up epochs for the binary loss.
+        disc_loss_alpha (float): Alpha value for the discriminator loss.
+        gen_loss_alpha (float): Alpha value for the generator loss.
+        feat_loss_alpha (float): Alpha value for the feature loss.
+        vocoder_mel_loss_alpha (float): Alpha value for the vocoder mel loss.
+        multi_scale_stft_loss_alpha (float): Alpha value for the multi-scale STFT loss.
+        multi_scale_stft_loss_params (dict): Parameters for the multi-scale STFT loss.
+        return_wav (bool): Whether to return audio waveforms.
+        use_weighted_sampler (bool): Whether to use a weighted sampler.
+        weighted_sampler_attrs (dict): Attributes for the weighted sampler.
+        weighted_sampler_multipliers (dict): Multipliers for the weighted sampler.
+        r (int): Value for the `r` override.
+        compute_f0 (bool): Whether to compute F0 values.
+        f0_cache_path (str): Path to the F0 cache.
+        attn_prior_cache_path (str): Path to the attention prior cache.
+        num_speakers (int): Number of speakers.
+        use_speaker_embedding (bool): Whether to use speaker embedding.
+        speakers_file (str): Path to the speaker file.
+        speaker_embedding_channels (int): Number of channels for the speaker embedding.
+        language_ids_file (str): Path to the language IDs file.
+    """
+
+    model: str = "delightful_tts"
+
+    # model specific params
+    audio: DelightfulTtsAudioConfig = field(default_factory=DelightfulTtsAudioConfig)
+    model_args: DelightfulTtsArgs = field(default_factory=DelightfulTtsArgs)
+    use_attn_priors: bool = True
+
+    # vocoder
+    vocoder: VocoderConfig = field(default_factory=VocoderConfig)
+    init_discriminator: bool = True
+
+    # optimizer
+    steps_to_start_discriminator: int = 200000
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+
+    # acoustic model loss params
+    ssim_loss_alpha: float = 1.0  # NOTE(review): the commit log mentions "remove ssim loss" — confirm this weight is still read
+    mel_loss_alpha: float = 1.0
+    aligner_loss_alpha: float = 1.0
+    pitch_loss_alpha: float = 1.0
+    energy_loss_alpha: float = 1.0
+    u_prosody_loss_alpha: float = 0.5
+    p_prosody_loss_alpha: float = 0.5
+    dur_loss_alpha: float = 1.0
+    char_dur_loss_alpha: float = 0.01
+    binary_align_loss_alpha: float = 0.1
+    binary_loss_warmup_epochs: int = 10
+
+    # vocoder loss params
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    vocoder_mel_loss_alpha: float = 10.0
+    multi_scale_stft_loss_alpha: float = 2.5
+    multi_scale_stft_loss_params: dict = field(
+        default_factory=lambda: {
+            "n_ffts": [1024, 2048, 512],
+            "hop_lengths": [120, 240, 50],
+            "win_lengths": [600, 1200, 240],
+        }
+    )
+
+    # data loader params
+    return_wav: bool = True
+    use_weighted_sampler: bool = False
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+
+    # overrides
+    r: int = 1
+
+    # dataset configs
+    compute_f0: bool = True
+    f0_cache_path: str = None
+    attn_prior_cache_path: str = None
+
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256  # NOTE(review): not forwarded to model_args in __post_init__ — confirm where it is read
+    language_ids_file: str = None  # NOTE(review): not forwarded to model_args in __post_init__ — confirm where it is read
+    use_language_embedding: bool = False  # missing from the class docstring; also not forwarded to model_args in __post_init__
+
+    # use d-vectors (missing from the class docstring); forwarded to model_args by __post_init__ when set
+    use_d_vector_file: bool = False
+    d_vector_file: str = None
+    d_vector_dim: int = None
+
+    # testing (test_sentences is missing from the class docstring)
+    test_sentences: List[str] = field(
+        default_factory=lambda: [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "Be a voice, not an echo.",
+            "I'm sorry Dave. I'm afraid I can't do that.",
+            "This cake is great. It's so delicious and moist.",
+            "Prior to November 22, 1963.",
+        ]
+    )
+
+    def __post_init__(self):
+        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+        if self.num_speakers > 0:
+            self.model_args.num_speakers = self.num_speakers
+
+        # speaker embedding settings: forwarding is one-way, so a falsy value here leaves the model_args value untouched
+        if self.use_speaker_embedding:
+            self.model_args.use_speaker_embedding = True
+        if self.speakers_file:
+            self.model_args.speakers_file = self.speakers_file
+
+        # d-vector settings: same one-way forwarding; d_vector_dim is only copied when it is a positive number
+        if self.use_d_vector_file:
+            self.model_args.use_d_vector_file = True
+        if self.d_vector_dim is not None and self.d_vector_dim > 0:
+            self.model_args.d_vector_dim = self.d_vector_dim
+        if self.d_vector_file:
+            self.model_args.d_vector_file = self.d_vector_file
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
@@ -686,6 +686,7 @@ def __init__(
         self,
         samples: Union[List[List], List[Dict]],
         ap: "AudioProcessor",
+        audio_config=None,  # pylint: disable=unused-argument
         verbose=False,
         cache_path: str = None,
         precompute_num_workers=0,

diff --git a/TTS/tts/layers/delightful_tts/__init__.py b/TTS/tts/layers/delightful_tts/__init__.py