Commit: reformatting and styling
erogol committed Apr 12, 2021
1 parent 9011ddd commit f519012
Showing 159 changed files with 6,605 additions and 6,445 deletions.
28 changes: 28 additions & 0 deletions Makefile
@@ -0,0 +1,28 @@
.DEFAULT_GOAL := help
.PHONY: test deps style lint install help

help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

target_dirs := tests TTS notebooks

system-deps: ## install linux system deps
sudo apt-get install -y espeak-ng
sudo apt-get install -y libsndfile1-dev

deps: ## install 🐸 requirements.
pip install -r requirements.txt

test: ## run tests.
nosetests --with-cov -cov --cover-erase --cover-package TTS tests
./run_bash_tests.sh

style: ## update code style.
black ${target_dirs}
isort ${target_dirs}

lint: ## run pylint linter.
pylint ${target_dirs}

install: ## install 🐸 TTS for development.
pip install -e .
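
The new Makefile gives contributors single-command entry points for setup, formatting, linting, and testing. A typical first-time workflow might chain the targets as in the sketch below (the apt-get based system-deps target assumes a Debian/Ubuntu host; the ordering here is a suggestion, not part of the commit):

# one-time machine setup (Debian/Ubuntu assumed for apt-get)
make system-deps
# install 🐸 requirements, then an editable development checkout
make deps
make install
# before opening a PR: auto-format with black/isort, then lint and test
make style
make lint
make test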
135 changes: 66 additions & 69 deletions TTS/bin/compute_attention_masks.py
@@ -1,30 +1,28 @@
import argparse
import importlib
import os
from argparse import RawTextHelpFormatter

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from argparse import RawTextHelpFormatter

from TTS.tts.datasets.TTSDataset import MyDataset
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_checkpoint
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config


if __name__ == '__main__':
if __name__ == "__main__":
# pylint: disable=bad-continuation
parser = argparse.ArgumentParser(
description='''Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n'''

'''Each attention mask is written to the same path as the input wav file with ".npy" file extension.
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n'''

'''
description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
"""Each attention mask is written to the same path as the input wav file with ".npy" file extension.
(e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
"""
Example run:
CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
@@ -34,53 +32,44 @@
--batch_size 32
--dataset ljspeech
--use_cuda True
''',
formatter_class=RawTextHelpFormatter
)
parser.add_argument('--model_path',
type=str,
required=True,
help='Path to Tacotron/Tacotron2 model file ')
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
parser.add_argument(
'--config_path',
"--config_path",
type=str,
required=True,
help='Path to Tacotron/Tacotron2 config file.',
help="Path to Tacotron/Tacotron2 config file.",
)
parser.add_argument('--dataset',
type=str,
default='',
required=True,
help='Target dataset processor name from TTS.tts.dataset.preprocess.')

parser.add_argument(
'--dataset_metafile',
"--dataset",
type=str,
default='',
default="",
required=True,
help='Dataset metafile including file paths with transcripts.')
help="Target dataset processor name from TTS.tts.dataset.preprocess.",
)

parser.add_argument(
'--data_path',
"--dataset_metafile",
type=str,
default='',
help='Defines the data path. It overwrites config.json.')
parser.add_argument('--use_cuda',
type=bool,
default=False,
help="enable/disable cuda.")
default="",
required=True,
help="Dataset metafile including file paths with transcripts.",
)
parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")

parser.add_argument(
'--batch_size',
default=16,
type=int,
help='Batch size for the model. Use batch_size=1 if you have no CUDA.')
"--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
)
args = parser.parse_args()

C = load_config(args.config_path)
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
if "characters" in C.keys():
symbols, phonemes = make_symbols(**C.characters)

# load the model
@@ -91,28 +80,32 @@
model.eval()

# data loader
preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')
preprocessor = importlib.import_module("TTS.tts.datasets.preprocess")
preprocessor = getattr(preprocessor, args.dataset)
meta_data = preprocessor(args.data_path, args.dataset_metafile)
dataset = MyDataset(model.decoder.r,
C.text_cleaner,
compute_linear_spec=False,
ap=ap,
meta_data=meta_data,
tp=C.characters if 'characters' in C.keys() else None,
add_blank=C['add_blank'] if 'add_blank' in C.keys() else False,
use_phonemes=C.use_phonemes,
phoneme_cache_path=C.phoneme_cache_path,
phoneme_language=C.phoneme_language,
enable_eos_bos=C.enable_eos_bos_chars)
dataset = MyDataset(
model.decoder.r,
C.text_cleaner,
compute_linear_spec=False,
ap=ap,
meta_data=meta_data,
tp=C.characters if "characters" in C.keys() else None,
add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
use_phonemes=C.use_phonemes,
phoneme_cache_path=C.phoneme_cache_path,
phoneme_language=C.phoneme_language,
enable_eos_bos=C.enable_eos_bos_chars,
)

dataset.sort_items()
loader = DataLoader(dataset,
batch_size=args.batch_size,
num_workers=4,
collate_fn=dataset.collate_fn,
shuffle=False,
drop_last=False)
loader = DataLoader(
dataset,
batch_size=args.batch_size,
num_workers=4,
collate_fn=dataset.collate_fn,
shuffle=False,
drop_last=False,
)

# compute attentions
file_paths = []
@@ -134,25 +127,29 @@
mel_input = mel_input.cuda()
mel_lengths = mel_lengths.cuda()

mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(
text_input, text_lengths, mel_input)
mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)

alignments = alignments.detach()
for idx, alignment in enumerate(alignments):
item_idx = item_idxs[idx]
# interpolate if r > 1
alignment = torch.nn.functional.interpolate(
alignment.transpose(0, 1).unsqueeze(0),
size=None,
scale_factor=model.decoder.r,
mode='nearest',
align_corners=None,
recompute_scale_factor=None).squeeze(0).transpose(0, 1)
alignment = (
torch.nn.functional.interpolate(
alignment.transpose(0, 1).unsqueeze(0),
size=None,
scale_factor=model.decoder.r,
mode="nearest",
align_corners=None,
recompute_scale_factor=None,
)
.squeeze(0)
.transpose(0, 1)
)
# remove paddings
alignment = alignment[:mel_lengths[idx], :text_lengths[idx]].cpu().numpy()
alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
# set file paths
wav_file_name = os.path.basename(item_idx)
align_file_name = os.path.splitext(wav_file_name)[0] + '.npy'
align_file_name = os.path.splitext(wav_file_name)[0] + ".npy"
file_path = item_idx.replace(wav_file_name, align_file_name)
# save output
file_paths.append([item_idx, file_path])
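
The reformatted script still does what its help text describes: it loads a Tacotron/Tacotron2 checkpoint, upsamples each attention alignment by the decoder's reduction factor r, and writes one .npy mask next to every wav. A hedged end-to-end invocation (all paths hypothetical) might look like:

# hypothetical checkpoint/dataset paths; flags follow the parser definitions above
CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py \
    --model_path checkpoints/checkpoint_200000.pth.tar \
    --config_path checkpoints/config.json \
    --dataset ljspeech \
    --dataset_metafile metadata.csv \
    --data_path /data/LJSpeech-1.1/ \
    --batch_size 32 \
    --use_cuda True
# each wav then gains a sibling mask, e.g. wavs/LJ001-0001.wav -> wavs/LJ001-0001.npy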
87 changes: 34 additions & 53 deletions TTS/bin/compute_embeddings.py
@@ -3,101 +3,82 @@
import os

import numpy as np
import torch
from tqdm import tqdm

import torch
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.speakers import save_speaker_mapping
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.utils.speakers import save_speaker_mapping
from TTS.tts.datasets.preprocess import load_meta_data

parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.')
parser.add_argument(
'model_path',
type=str,
help='Path to model outputs (checkpoint, tensorboard etc.).')
parser.add_argument(
'config_path',
type=str,
help='Path to config file for training.',
description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.'
)
parser.add_argument("model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.).")
parser.add_argument(
'data_path',
type=str,
help='Data path for wav files - directory or CSV file')
parser.add_argument(
'output_path',
type=str,
help='path for training outputs.')
parser.add_argument(
'--target_dataset',
"config_path",
type=str,
default='',
help='Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.'
)
parser.add_argument(
'--use_cuda', type=bool, help='flag to set cuda.', default=False
help="Path to config file for training.",
)
parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file")
parser.add_argument("output_path", type=str, help="path for training outputs.")
parser.add_argument(
'--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
"--target_dataset",
type=str,
default="",
help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|")
args = parser.parse_args()


c = load_config(args.config_path)
ap = AudioProcessor(**c['audio'])
ap = AudioProcessor(**c["audio"])

data_path = args.data_path
split_ext = os.path.splitext(data_path)
sep = args.separator

if args.target_dataset != '':
if args.target_dataset != "":
# if target dataset is defined
dataset_config = [
{
"name": args.target_dataset,
"path": args.data_path,
"meta_file_train": None,
"meta_file_val": None
},
{"name": args.target_dataset, "path": args.data_path, "meta_file_train": None, "meta_file_val": None},
]
wav_files, _ = load_meta_data(dataset_config, eval_split=False)
output_files = [wav_file[1].replace(data_path, args.output_path).replace(
'.wav', '.npy') for wav_file in wav_files]
output_files = [wav_file[1].replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files]
else:
# if target dataset is not defined
if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
# Parse CSV
print(f'CSV file: {data_path}')
print(f"CSV file: {data_path}")
with open(data_path) as f:
wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
wav_path = os.path.join(os.path.dirname(data_path), "wavs")
wav_files = []
print(f'Separator is: {sep}')
print(f"Separator is: {sep}")
for line in f:
components = line.split(sep)
if len(components) != 2:
print("Invalid line")
continue
wav_file = os.path.join(wav_path, components[0] + '.wav')
#print(f'wav_file: {wav_file}')
wav_file = os.path.join(wav_path, components[0] + ".wav")
# print(f'wav_file: {wav_file}')
if os.path.exists(wav_file):
wav_files.append(wav_file)
print(f'Count of wavs imported: {len(wav_files)}')
print(f"Count of wavs imported: {len(wav_files)}")
else:
# Parse all wav files in data_path
wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
wav_files = glob.glob(data_path + "/**/*.wav", recursive=True)

output_files = [wav_file.replace(data_path, args.output_path).replace(
'.wav', '.npy') for wav_file in wav_files]
output_files = [wav_file.replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files]

for output_file in output_files:
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# define Encoder model
model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.load_state_dict(torch.load(args.model_path)["model"])
model.eval()
if args.use_cuda:
model.cuda()
@@ -117,14 +98,14 @@
embedd = embedd.detach().cpu().numpy()
np.save(output_files[idx], embedd)

if args.target_dataset != '':
if args.target_dataset != "":
# create speaker_mapping if target dataset is defined
wav_file_name = os.path.basename(wav_file)
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]['name'] = speaker_name
speaker_mapping[wav_file_name]['embedding'] = embedd.flatten().tolist()
speaker_mapping[wav_file_name]["name"] = speaker_name
speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()

if args.target_dataset != '':
if args.target_dataset != "":
# save speaker_mapping if target dataset is defined
mapping_file_path = os.path.join(args.output_path, 'speakers.json')
mapping_file_path = os.path.join(args.output_path, "speakers.json")
save_speaker_mapping(args.output_path, speaker_mapping)
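
Since this script's help text carries no example run, here is a hedged one (paths and dataset name are hypothetical; the four positional arguments follow the parser definitions above):

# positional args: model_path config_path data_path output_path
python TTS/bin/compute_embeddings.py \
    speaker_encoder/best_model.pth.tar \
    speaker_encoder/config.json \
    /data/LJSpeech-1.1 \
    embeddings_out \
    --target_dataset ljspeech \
    --use_cuda True
# writes one .npy embedding per wav under embeddings_out/ and, because
# --target_dataset is set, a speakers.json mapping for multi-speaker training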