Add missings

coqui-ai · erogol · Mar 25, 2023 · Mar 23, 2023 · Mar 23, 2023 · Mar 23, 2023
commit fa24171f2b0328e7738547764f5860bbd105aecc
diff --git a/TTS/vc/configs/__init__.py b/TTS/vc/configs/__init__.py
diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py
@@ -0,0 +1,5 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
diff --git a/TTS/vc/modules/__init__.py b/TTS/vc/modules/__init__.py
diff --git a/TTS/vc/modules/freevc/__init__.py b/TTS/vc/modules/freevc/__init__.py
diff --git a/TTS/vc/modules/freevc/speaker_encoder/__init__.py b/TTS/vc/modules/freevc/speaker_encoder/__init__.py
diff --git a/TTS/vc/modules/freevc/speaker_encoder/hparams.py b/TTS/vc/modules/freevc/speaker_encoder/hparams.py
@@ -0,0 +1,31 @@
+## Mel-filterbank
+mel_window_length = 25  # In milliseconds
+mel_window_step = 10  # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160  # 1600 ms (160 frames x 10 ms mel_window_step)
+
+
+## Voice Activity Detection (VAD)
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30  # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to avoid being smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
diff --git a/TTS/vc/modules/freevc/wavlm/config.json b/TTS/vc/modules/freevc/wavlm/config.json
@@ -0,0 +1,99 @@
+{
+    "_name_or_path": "./wavlm-large/",
+    "activation_dropout": 0.0,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "WavLMModel"
+    ],
+    "attention_dropout": 0.1,
+    "bos_token_id": 1,
+    "classifier_proj_size": 256,
+    "codevector_dim": 768,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": true,
+    "eos_token_id": 2,
+    "feat_extract_activation": "gelu",
+    "feat_extract_dropout": 0.0,
+    "feat_extract_norm": "layer",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.075,
+    "mask_time_selection": "static",
+    "max_bucket_distance": 800,
+    "model_type": "wavlm",
+    "num_adapter_layers": 3,
+    "num_attention_heads": 16,
+    "num_buckets": 320,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_ctc_classes": 80,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 24,
+    "num_negatives": 100,
+    "output_hidden_size": 1024,
+    "pad_token_id": 0,
+    "proj_codevector_dim": 768,
+    "replace_prob": 0.5,
+    "tokenizer_class": "Wav2Vec2CTCTokenizer",
+    "torch_dtype": "float32",
+    "transformers_version": "4.15.0.dev0",
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32
+  }
diff --git a/tests/vc_tests/__init__.py b/tests/vc_tests/__init__.py