-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Training recipes for thorsten dataset (#1020)
* Fix style * Fix isort * Remove tensorboardX from requirements Co-authored-by: logan hart <72301874+loganhart420@users.noreply.github.com> Co-authored-by: Eren Gölge <egolge@coqui.ai>
- Loading branch information
1 parent
71111d1
commit a790df4
Showing
17 changed files
with
822 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -166,4 +166,4 @@ internal/* | |
*_phoneme.npy | ||
wandb | ||
depot/* | ||
coqui_recipes/* | ||
coqui_recipes/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# 🐸💬 TTS Thorsten Recipes | ||
|
||
For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset. | ||
|
||
You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_de.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present. | ||
|
||
Then, go to your desired model folder and run the training. | ||
|
||
Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) | ||
```terminal | ||
CUDA_VISIBLE_DEVICES="0" python train_modelX.py | ||
``` | ||
|
||
💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best | ||
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
"""Training recipe: AlignTTS on the Thorsten-DE dataset."""
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.align_tts_config import AlignTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de

# Directory holding this script. It doubles as the training output folder and
# as the anchor from which the dataset location is resolved.
output_path = os.path.dirname(os.path.abspath(__file__))

# DATASET
# The dataset is expected one level above this script, in `thorsten-de/`.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "../thorsten-de/"),
)

# Fetch the dataset on first use. The downloader is handed the parent
# directory of the dataset folder, not the dataset folder itself.
if not os.path.exists(dataset_config.path):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_config.path)))

# TRAINING CONFIGURATION
config = AlignTTSConfig(
    # where to read from / write to
    output_path=output_path,
    datasets=[dataset_config],
    # batching and data loading
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # schedule
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=False,
    # text processing
    text_cleaner="phoneme_cleaners",
    use_phonemes=False,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # console reporting
    print_step=25,
    print_eval=True,
    # German sentences passed to the config as test sentences
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
)

# AUDIO PROCESSOR
# Built from the training config; handed to the model below.
ap = AudioProcessor.init_from_config(config)

# TOKENIZER
# `init_from_config` returns the tokenizer together with a config object,
# which replaces the one created above.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# Load the samples described by `dataset_config` and split off an evaluation
# set, sized by the values stored in the config.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
)

# MODEL
model = AlignTTS(config, ap, tokenizer)

# TRAINER
# Default trainer arguments; checkpoints and logs go to `output_path`
# (presumably - confirm against the Trainer documentation).
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start training.
trainer.fit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Download the Thorsten-DE dataset and prepare train/validation metadata files.
# All paths are relative, so the `thorsten-de` folder ends up in the directory
# this script is run from.

# create a temporary venv so gdown is not installed into the ambient Python
python3 -m venv env
# activate the venv created above (it lives in `env`, not `.env`)
source env/bin/activate
pip install pip --upgrade

# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz

# create train-val splits
# (the steps below expect the archive to unpack into a folder named LJSpeech-1.1)
# shuffle once, then take the first 20668 lines for training and the last 2000
# lines for validation
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv

# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz

# destroy venv
# leave the venv first so the shell is not left pointing at a deleted interpreter
deactivate
rm -rf env
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
"""Training recipe: GlowTTS on the Thorsten-DE dataset."""
import os

# Trainer runs the training loop; TrainerArgs holds the arguments of the Trainer.
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: model-related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: describes the dataset (name, metadata file, path).
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de

# The folder containing this script is used as the training folder and as the
# anchor from which the dataset location is resolved.
output_path = os.path.dirname(os.path.abspath(__file__))

# DATASET CONFIG
# Thorsten-DE is the target dataset; it is expected one level above this
# script, in `thorsten-de/`.
dataset_config = BaseDatasetConfig(
    name="thorsten",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "../thorsten-de/"),
)

# Fetch the dataset on first use. The downloader is handed the parent
# directory of the dataset folder, not the dataset folder itself.
if not os.path.exists(dataset_config.path):
    print("Downloading dataset")
    download_thorsten_de(os.path.dirname(os.path.abspath(dataset_config.path)))

# TRAINING CONFIGURATION
config = GlowTTSConfig(
    # where to read from / write to
    output_path=output_path,
    datasets=[dataset_config],
    # batching and data loading
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # schedule
    epochs=1000,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    # text processing: German phonemes, cached next to this script
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # console reporting
    print_step=25,
    print_eval=False,
    # German sentences passed to the config as test sentences
    test_sentences=[
        "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
        "Sei eine Stimme, kein Echo.",
        "Es tut mir Leid David. Das kann ich leider nicht machen.",
        "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
        "Vor dem 22. November 1963.",
    ],
)

# AUDIO PROCESSOR
# Built from the training config; handed to the model below.
ap = AudioProcessor.init_from_config(config)

# TOKENIZER
# `init_from_config` returns the tokenizer together with a config object,
# which replaces the one created above.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# Load the samples described by `dataset_config` and split off an evaluation
# set, sized by the values stored in the config.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
)

# MODEL
# No speaker manager is passed: this recipe trains on a single dataset entry
# and sets `speaker_manager=None` explicitly.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# TRAINER
# Default trainer arguments; checkpoints and logs go to `output_path`
# (presumably - confirm against the Trainer documentation).
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start training.
trainer.fit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Training recipe: HiFi-GAN vocoder on the Thorsten-DE dataset."""
import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

# Directory holding this script. It doubles as the training output folder and
# as the anchor from which the wav folder is resolved.
output_path = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
config = HifiganConfig(
    # where to read from / write to
    data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
    output_path=output_path,
    # batching and data loading
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    eval_split_size=10,
    # audio segment settings
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    # schedule
    epochs=1000,
    run_eval=True,
    test_delay_epochs=5,
    mixed_precision=False,
    # generator / discriminator learning rates
    lr_gen=1e-4,
    lr_disc=1e-4,
    # console reporting
    print_step=25,
    print_eval=False,
)

# Fetch the dataset on first use.
if not os.path.exists(config.data_path):
    print("Downloading dataset")
    # `data_path` points at `<root>/thorsten-de/wavs/`; the downloader is
    # handed `<root>`, i.e. two levels up.
    download_path = os.path.abspath(config.data_path)
    download_path = os.path.abspath(os.path.join(download_path, "../../"))
    download_thorsten_de(download_path)

# AUDIO PROCESSOR
# Constructed from the audio section of the config.
ap = AudioProcessor(**config.audio.to_dict())

# TRAINING SAMPLES
# The result is unpacked as (evaluation samples, training samples).
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = GAN(config, ap)

# TRAINER
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start training.
trainer.fit()
53 changes: 53 additions & 0 deletions
53
recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Training recipe: Multi-band MelGAN vocoder on the Thorsten-DE dataset."""
import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

# Directory holding this script. It doubles as the training output folder and
# as the anchor from which the wav folder is resolved.
output_path = os.path.dirname(os.path.abspath(__file__))

# TRAINING CONFIGURATION
config = MultibandMelganConfig(
    # where to read from / write to
    data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
    output_path=output_path,
    # batching and data loading
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    eval_split_size=10,
    # audio segment settings
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    # schedule
    epochs=1000,
    run_eval=True,
    test_delay_epochs=5,
    mixed_precision=False,
    # generator / discriminator learning rates
    lr_gen=1e-4,
    lr_disc=1e-4,
    # console reporting
    print_step=25,
    print_eval=False,
)

# Fetch the dataset on first use.
if not os.path.exists(config.data_path):
    print("Downloading dataset")
    # `data_path` points at `<root>/thorsten-de/wavs/`; the downloader is
    # handed `<root>`, i.e. two levels up.
    download_path = os.path.abspath(config.data_path)
    download_path = os.path.abspath(os.path.join(download_path, "../../"))
    download_thorsten_de(download_path)

# AUDIO PROCESSOR
# Constructed from the audio section of the config.
ap = AudioProcessor(**config.audio.to_dict())

# TRAINING SAMPLES
# The result is unpacked as (evaluation samples, training samples).
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# MODEL
model = GAN(config, ap)

# TRAINER
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start training.
trainer.fit()
Oops, something went wrong.