Skip to content

Commit

Permalink
Training recipes for thorsten dataset (#1020)
Browse files Browse the repository at this point in the history
* Fix style

* Fix isort

* Remove tensorboardX from requirements

Co-authored-by: logan hart <72301874+loganhart420@users.noreply.github.com>
Co-authored-by: Eren Gölge <egolge@coqui.ai>
  • Loading branch information
3 people authored May 30, 2022
1 parent 71111d1 commit a790df4
Show file tree
Hide file tree
Showing 17 changed files with 822 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,4 @@ jobs:
platforms: linux/${{ matrix.arch }}
push: ${{ github.event_name == 'push' }}
build-args: "BASE=${{ matrix.base }}"
tags: ${{ steps.compute-tag.outputs.tags }}
tags: ${{ steps.compute-tag.outputs.tags }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,4 @@ internal/*
*_phoneme.npy
wandb
depot/*
coqui_recipes/*
coqui_recipes/*
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]
CMD ["--help"]
15 changes: 15 additions & 0 deletions TTS/tts/datasets/formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,21 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg
return items


def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the thorsten meta data file to TTS format
https://github.com/thorstenMueller/deep-learning-german-tts/"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "thorsten"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[1]
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
return items


def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Normalizes the sam-accenture meta data file to TTS format
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
Expand Down
15 changes: 15 additions & 0 deletions recipes/thorsten_DE/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# 🐸💬 TTS Thorsten Recipes

For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.

You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_de.sh``` alternatively running any of the **train_modelX.py**scripts will download the dataset if not already present.

Then, go to your desired model folder and run the training.

Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```

💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
84 changes: 84 additions & 0 deletions recipes/thorsten_DE/align_tts/train_aligntts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.align_tts_config import AlignTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de

output_path = os.path.dirname(os.path.abspath(__file__))

# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)

# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])

config = AlignTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=False,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=True,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)

# init model
model = AlignTTS(config, ap, tokenizer)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

# AND... 3,2,1... 🚀
trainer.fit()
21 changes: 21 additions & 0 deletions recipes/thorsten_DE/download_thorsten_DE.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# create venv
python3 -m venv env
source .env/bin/activate
pip install pip --upgrade

# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz

# create train-val splits
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv

# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz

# destry venv
rm -rf env
97 changes: 97 additions & 0 deletions recipes/thorsten_DE/glow_tts/train_glowtts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os

# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de

# we use the same path as this script as our training folder.
output_path = os.path.dirname(os.path.abspath(__file__))

# DEFINE DATASET CONFIG
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)

# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])

# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=False,
mixed_precision=True,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)

# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

# AND... 3,2,1... 🚀
trainer.fit()
53 changes: 53 additions & 0 deletions recipes/thorsten_DE/hifigan/train_hifigan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

output_path = os.path.dirname(os.path.abspath(__file__))

config = HifiganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)

# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# init model
model = GAN(config, ap)

# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
53 changes: 53 additions & 0 deletions recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

output_path = os.path.dirname(os.path.abspath(__file__))

config = MultibandMelganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)

# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# init model
model = GAN(config, ap)

# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
Loading

0 comments on commit a790df4

Please sign in to comment.