Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement FreeVC #2451

Merged
merged 10 commits into from
Mar 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Tests and relevant updates
  • Loading branch information
erogol committed Mar 23, 2023
commit a598a236e0596fcd086b1b06f28e41ea273a8750
13 changes: 13 additions & 0 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -802,5 +802,18 @@
}
}
}
},
"voice_conversion_models":{
"multilingual":{
"vctk":{
"freevc24":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"author": "Jing-Yi Li @OlaWod",
"license": "MIT",
"commit": null
}
}
}
}
}
86 changes: 81 additions & 5 deletions TTS/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

Expand Down Expand Up @@ -49,11 +50,14 @@ def __init__(
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

self.synthesizer = None
self.voice_converter = None

if model_name:
self.load_model_by_name(model_name, gpu)
self.load_tts_model_by_name(model_name, gpu)
if model_path:
self.load_model_by_path(
self.load_tts_model_by_path(
model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
)

Expand Down Expand Up @@ -96,12 +100,22 @@ def list_models():

def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if model_item["default_vocoder"] is None:
if model_item.get("default_vocoder") is None:
return model_path, config_path, None, None
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
return model_path, config_path, vocoder_path, vocoder_config_path

def load_model_by_name(self, model_name: str, gpu: bool = False):
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
    """Download a voice conversion model by name and install it as the active converter.

    The checkpoint and config paths come from `download_model_by_name`; the vocoder
    entries of its 4-item result are ignored. The resulting `Synthesizer` is stored in
    `self.voice_converter`, leaving `self.synthesizer` untouched.

    Args:
        model_name (str): Full model name handed to `download_model_by_name`, e.g.
            "voice_conversion_models/multilingual/vctk/freevc24".
        gpu (bool, optional): Forwarded to `Synthesizer` as `use_cuda`. Defaults to False.
    """
    checkpoint_path, cfg_path, _vocoder_path, _vocoder_cfg = self.download_model_by_name(model_name)
    converter = Synthesizer(vc_checkpoint=checkpoint_path, vc_config=cfg_path, use_cuda=gpu)
    self.voice_converter = converter

def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
"""Load one of 🐸TTS models by name.

Args:
Expand All @@ -127,7 +141,7 @@ def load_model_by_name(self, model_name: str, gpu: bool = False):
use_cuda=gpu,
)

def load_model_by_path(
def load_tts_model_by_path(
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
):
"""Load a model from a path.
Expand Down Expand Up @@ -219,3 +233,65 @@ def tts_to_file(
"""
wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
self.synthesizer.save_wav(wav=wav, path=file_path)

def voice_conversion(
    self,
    sourve_wav: str,
    target_wav: str,
):
    """Voice conversion with FreeVC. Convert source wav to target speaker.

    The conversion runs on `self.voice_converter`, the object set by
    `load_vc_model_by_name`, and not on the TTS synthesizer. If no voice conversion
    model has been loaded yet, the default FreeVC model is loaded first, mirroring
    `tts_with_vc`.

    Args:
        sourve_wav (str):
            Path to the source wav file. The parameter name is a historical misspelling
            of `source_wav`; it is kept so existing keyword callers keep working.
        target_wav (str):
            Path to the target wav file.

    Returns:
        Whatever the converter's `voice_conversion` returns (the converted waveform).
    """
    # Lazily load the default converter so the method works without prior setup.
    if self.voice_converter is None:
        self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
    return self.voice_converter.voice_conversion(source_wav=sourve_wav, target_wav=target_wav)

def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
    """Synthesize speech, then push it through voice conversion to imitate a target voice.

    Two stages:

    - `self.tts` renders `text` with no speaker selected (`speaker=None`).
    - The voice converter maps that audio onto the voice found in `speaker_wav`.

    The default FreeVC model is loaded on demand when no voice converter is set.

    Args:
        text (str):
            Input text to synthesize.
        language (str, optional):
            Language code forwarded to `self.tts`. Defaults to None.
        speaker_wav (str, optional):
            Reference wav passed to the converter as `target_wav`, i.e. the voice the
            synthesized speech is converted to. Defaults to None.

    Returns:
        The result of the converter's `voice_conversion` call.
    """
    synthesized = self.tts(text=text, speaker=None, language=language)
    # Load only after synthesis succeeded, and only if nothing is loaded yet.
    if self.voice_converter is None:
        self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
    converted = self.voice_converter.voice_conversion(source_wav=synthesized, target_wav=speaker_wav)
    return converted

def tts_with_vc_to_file(
    self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
):
    """Run `tts_with_vc` and write the converted audio to disk.

    Check `tts_with_vc` for more details.

    Args:
        text (str):
            Input text to synthesize.
        language (str, optional):
            Language code forwarded to `tts_with_vc`. Defaults to None.
        speaker_wav (str, optional):
            Reference wav forwarded to `tts_with_vc` as the conversion target.
            Defaults to None.
        file_path (str, optional):
            Output file path. Defaults to "output.wav".
    """
    converted = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
    # Read the rate only after tts_with_vc ran: it may have just loaded the converter.
    out_rate = self.voice_converter.vc_config.audio.output_sample_rate
    save_wav(wav=converted, path=file_path, sample_rate=out_rate)
80 changes: 62 additions & 18 deletions TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ def main():
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```

### Voice Conversion Models

```
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
# documentation in sync more easily.
Expand Down Expand Up @@ -245,6 +251,20 @@ def main():
default=True,
)

# voice conversion args
parser.add_argument(
"--source_wav",
type=str,
default=None,
help="Original audio file to convert in the voice of the target_wav",
)
parser.add_argument(
"--target_wav",
type=str,
default=None,
help="Target audio file to convert in the voice of the source_wav",
)

args = parser.parse_args()

# print the description (help text) if none of the arguments checked below is set
Expand All @@ -256,6 +276,8 @@ def main():
args.reference_wav,
args.model_info_by_idx,
args.model_info_by_name,
args.source_wav,
args.target_wav,
]
if not any(check_args):
parser.parse_args(["-h"])
Expand All @@ -264,21 +286,23 @@ def main():
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar)

model_path = None
config_path = None
tts_path = None
tts_config_path = None
speakers_file_path = None
language_ids_file_path = None
vocoder_path = None
vocoder_config_path = None
encoder_path = None
encoder_config_path = None
vc_path = None
vc_config_path = None

# CASE1 #list : list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()

# CASE2 #info : model info of pre-trained TTS models
# CASE2 #info : model info for pre-trained TTS models
if args.model_info_by_idx:
model_query = args.model_info_by_idx
manager.model_info_by_idx(model_query)
Expand All @@ -292,15 +316,27 @@ def main():
# CASE3: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

# tts model
if model_item["model_type"] == "tts_models":
tts_path = model_path
tts_config_path = config_path
if "default_vocoder" in model_item:
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

# voice conversion model
if model_item["model_type"] == "voice_conversion_models":
vc_path = model_path
vc_config_path = config_path

# load vocoder
if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE4: set custom model paths
if args.model_path is not None:
model_path = args.model_path
config_path = args.config_path
tts_path = args.model_path
tts_config_path = args.config_path
speakers_file_path = args.speakers_file_path
language_ids_file_path = args.language_ids_file_path

Expand All @@ -314,14 +350,16 @@ def main():

# load models
synthesizer = Synthesizer(
model_path,
config_path,
tts_path,
tts_config_path,
speakers_file_path,
language_ids_file_path,
vocoder_path,
vocoder_config_path,
encoder_path,
encoder_config_path,
vc_path,
vc_config_path,
args.use_cuda,
)

Expand Down Expand Up @@ -354,16 +392,22 @@ def main():
print(" > Text: {}".format(args.text))

# kick it
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
if tts_path is not None:
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx,
)
elif vc_path is not None:
wav = synthesizer.voice_conversion(
source_wav=args.source_wav,
target_wav=args.target_wav,
)

# save the results
print(" > Saving output to {}".format(args.out_path))
Expand Down
2 changes: 1 addition & 1 deletion TTS/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
for path in paths:
try:
config_class = find_module(path, config_name)
Expand Down
2 changes: 2 additions & 0 deletions TTS/tts/models/base_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class BaseTTS(BaseTrainerModel):
It defines common `tts` specific functions on top of `Model` implementation.
"""

MODEL_TYPE = "tts"

def __init__(
self,
config: Coqpit,
Expand Down
1 change: 1 addition & 0 deletions TTS/utils/generic_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def to_camel(text):
text = text.capitalize()
text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
text = text.replace("Tts", "TTS")
text = text.replace("vc", "VC")
return text


Expand Down
8 changes: 8 additions & 0 deletions TTS/utils/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,13 @@ def list_vocoder_models(self):
"""
return self._list_for_model_type("vocoder_models")

def list_vc_models(self):
    """List the voice conversion models.

    Delegates to `_list_for_model_type` with the "voice_conversion_models" category and
    returns its result unchanged (presumably it prints the entries and returns their
    names, like the sibling listing helpers).

    Format is `language/dataset/model`
    """
    vc_model_names = self._list_for_model_type("voice_conversion_models")
    return vc_model_names

def list_langs(self):
"""Print all the available languages"""
print(" Name format: type/language")
Expand Down Expand Up @@ -234,6 +241,7 @@ def download_model(self, model_name):
model_type, lang, dataset, model = model_name.split("/")
model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
model_item = self.models_dict[model_type][lang][dataset][model]
model_item["model_type"] = model_type
# set the model specific output path
output_path = os.path.join(self.output_prefix, model_full_name)
if os.path.exists(output_path):
Expand Down
Loading