Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement FreeVC #2451

Merged
merged 10 commits into from
Mar 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add missings
  • Loading branch information
erogol committed Mar 24, 2023
commit fa24171f2b0328e7738547764f5860bbd105aecc
Empty file added TTS/vc/configs/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions TTS/vc/configs/freevc_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from dataclasses import dataclass, field
from typing import List

from TTS.vc.configs.shared_configs import BaseVCConfig
from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
Empty file added TTS/vc/modules/__init__.py
Empty file.
Empty file.
Empty file.
31 changes: 31 additions & 0 deletions TTS/vc/modules/freevc/speaker_encoder/hparams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
## Mel-filterbank
mel_window_length = 25 # In milliseconds
mel_window_step = 10 # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160 # 1600 ms


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30 # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30


## Model parameters
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3
99 changes: 99 additions & 0 deletions TTS/vc/modules/freevc/wavlm/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
{
"_name_or_path": "./wavlm-large/",
"activation_dropout": 0.0,
"adapter_kernel_size": 3,
"adapter_stride": 2,
"add_adapter": false,
"apply_spec_augment": true,
"architectures": [
"WavLMModel"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"classifier_proj_size": 256,
"codevector_dim": 768,
"contrastive_logits_temperature": 0.1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"diversity_loss_weight": 0.1,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"feat_quantizer_dropout": 0.0,
"final_dropout": 0.0,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.075,
"mask_time_selection": "static",
"max_bucket_distance": 800,
"model_type": "wavlm",
"num_adapter_layers": 3,
"num_attention_heads": 16,
"num_buckets": 320,
"num_codevector_groups": 2,
"num_codevectors_per_group": 320,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_ctc_classes": 80,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"num_negatives": 100,
"output_hidden_size": 1024,
"pad_token_id": 0,
"proj_codevector_dim": 768,
"replace_prob": 0.5,
"tokenizer_class": "Wav2Vec2CTCTokenizer",
"torch_dtype": "float32",
"transformers_version": "4.15.0.dev0",
"use_weighted_layer_sum": false,
"vocab_size": 32
}
Empty file added tests/vc_tests/__init__.py
Empty file.