Skip to content

Commit

Permalink
F5-TTS Engine/Settings changes
Browse files Browse the repository at this point in the history
  • Loading branch information
erew123 authored Nov 4, 2024
1 parent bcd84a4 commit 5d43b23
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 32 deletions.
18 changes: 12 additions & 6 deletions system/tts_engines/f5tts/f5tts_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def f5tts_model_alltalk_settings(model_config_data):
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
generationspeed_set_gr = gr.Slider(value=float(model_config_data["settings"]["generationspeed_set"]), minimum=0.25, maximum=2.00, step=0.25, label="Speed" if model_config_data["model_capabilties"]["generationspeed_capable"] else "Speed N/A", interactive=model_config_data["model_capabilties"]["generationspeed_capable"])
generationspeed_set_gr = gr.Slider(value=float(model_config_data["settings"]["generationspeed_set"]), minimum=0.30, maximum=2.00, step=0.10, label="Speed" if model_config_data["model_capabilties"]["generationspeed_capable"] else "Speed N/A", interactive=model_config_data["model_capabilties"]["generationspeed_capable"])
with gr.Row():
with gr.Column():
gr.Markdown("### OpenAI Voice Mappings")
Expand Down Expand Up @@ -467,13 +467,13 @@ def refresh_file_list():
with gr.Row():
gr.Markdown("""
### 🟧 Where are the f5tts models stored?
This extension will download the models to `/alltalk_tts/models/f5tts/` folder.
This extension will download the f5tts models to `/alltalk_tts/models/f5tts/` folder.
### 🟧 How do reference voices work?
### 🟧 How do clone/reference voices work?
F5-TTS uses voice samples with corresponding reference text files for voice cloning:
1. Place your WAV voice samples in the `/alltalk_tts/voices/` folder
2. Create a matching `.reference.txt` file containing the exact text spoken in the recording
3. Use the Reference Text Manager tab to create and manage these text files
2. Create a matching `.reference.txt` file containing the exact text spoken in the recording.
3. Use the `Reference Text/Sample Manager` tab to create and manage these text files
4. Only voice samples with valid reference text files will be available for use
For best results:
Expand All @@ -487,7 +487,13 @@ def refresh_file_list():
> When `Disabled`, your output wav files will be left untouched.
> When set to a setting `1 Day` or greater, your output wav files older than that time period will be automatically deleted on start-up of AllTalk.
""")

gr.Markdown("""
### 🟧 Speed settings and voice cloning
F5-TTS appears to generate audio a little faster than the audio sample. As such, in AllTalk, the speed setting for generation is set to 0.9 (see the F5-TTS `Default Settings` tab). Certainly, the better the punctuation you use, the better the end result, both in the audio sample's reference file and the text you send to be generated as TTS. If you wish to globally adjust this, you can make the change on the Default Settings tab.
### 🟧 Cloned audio and punctuation.
When F5-TTS reproduces audio, punctuation appears to matter greatly. As such, `Im` and `I'm` or `I am` will all sound a little different. Additionally, it appears to want to spell out text in CAPS. You may want to look at the F5-TTS developers' page for more information on this.
""")

return app

Expand Down
38 changes: 15 additions & 23 deletions system/tts_engines/f5tts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,19 @@

def install_and_restart():
try:
print("##########################################")
print("F5-TTS not found. Attempting to install...")
print("##########################################")
subprocess.check_call([
sys.executable,
"-m",
"pip",
"install",
"git+https://github.com/SWivid/F5-TTS.git"
])
print("########################################################")
print("F5-TTS installed successfully! Restarting application...")
print("########################################################")

# Get the current script's path
script_path = sys.argv[0]
Expand All @@ -56,7 +60,9 @@ def install_and_restart():
os.execv(sys.executable, ['python'] + sys.argv)

except subprocess.CalledProcessError as e:
print("########################################################")
print(f"Failed to install F5-TTS: {str(e)}")
print("########################################################")
raise ImportError("Could not install required package F5-TTS")

try:
Expand Down Expand Up @@ -152,6 +158,9 @@ def __init__(self):
self.debug_tts = configfile_data.get("debugging").get("debug_tts") # Can be used within this script as a True/False flag for generally debugging the TTS generation process.
self.debug_tts_variables = configfile_data.get("debugging").get("debug_tts_variables") # Can be used within this script as a True/False flag for generally debugging variables (if you wish).

############################################################################
# DONT CHANGE # These settings are specific to the F5-TTS Model/Engine ####
############################################################################
# Add F5-TTS specific parameters
self.target_sample_rate = 24000
self.n_mel_channels = 100
Expand Down Expand Up @@ -444,16 +453,10 @@ def has_reference_text(wav_path):

#################################################################################
#################################################################################
# CHANGE ME # Model loading # Piper does not actually load/stay resident in RAM #
# CHANGE ME # Model loading #####################################################
#################################################################################
#################################################################################
# This function will handle the loading of your model, into VRAM/CUDA, System RAM or whatever.
# In XTTS, which has 2x model loader types, there are 2x loaders. They are called by "def handle_tts_method_change"
# In Piper we fake a model loader as Piper doesn't actually load a model into CUDA/System RAM as such. So, in that
# situation, api_manual_load_model is kind of a blank function. Though we do set self.is_tts_model_loaded = True
# as this is used elsewhere in the scripts to confirm that a model is available to be used for TTS generation.
# We always check for "No Models Available" being sent as that means we are trying to load in a model that
# doesn't exist/wasn't found on script start-up e.g. someone deleted the model from the folder or something.
async def api_manual_load_model(self, model_name):
if model_name == "No Models Found":
print(f"[{self.branding}ENG] \033[91mError\033[0m: No models for this TTS engine were found to load.")
Expand Down Expand Up @@ -567,11 +570,6 @@ async def api_manual_load_model(self, model_name):
###############################
###############################
# This function will handle the UN-loading of your model, from VRAM/CUDA, System RAM or whatever.
# In XTTS, that model loads into CUDA/System Ram, so when we swap models, we want to unload the current model
# free up the memory and then load in the new model to VRAM/CUDA. On the flip side of that, Piper doesn't
# load into memory, so we just need to put a fake function here that doesn't really do anything
# other than set "self.is_tts_model_loaded = False", which would be set back to true by the model loader.
# So look at the Piper model_engine.py if you DON'T need to unload models.
async def unload_model(self):
# ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
# ↑↑↑ Keep everything above this line ↑↑↑
Expand All @@ -597,15 +595,11 @@ async def unload_model(self):

###################################################################################################################################
###################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one # XTTS is very unusal as it has 2x model loading methods #
# CHANGE ME # Model changing. Unload out old model and load in a new one ##########################################################
###################################################################################################################################
###################################################################################################################################
# This function is your central model loading/unloading handler that deals with the above functions as necessary, to call loading, unloading,
# swapping DeepSpeed, Low vram etc. This function gets called with an "engine name - model name" type call. In XTTS, because there are 2x
# model loader types (XTTS and APILocal), we take tts_method and split the "engine name - model name" into a loader type and the model
# that it needs to load in and then we call the correct loader function. Whereas in Piper, which doesn't load models into memory at all,
# we just have a fake function that doesn't really do anything. We always check to see if the model name has "No Models Available" in the
# name that's sent over, just to catch any potential errors. We display the start load time and end load time. That's about it.
# swapping DeepSpeed, Low vram etc. This function gets called with an "engine name - model name" type call.
async def handle_tts_method_change(self, tts_method):
generate_start_time = time.time() # Record the start time of loading the model
if "No Models Available" in self.available_models:
Expand Down Expand Up @@ -887,9 +881,9 @@ def chunk_text(self, text, max_chars=135):

return chunks

##########################################################################################################################################
##########################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one # XTTS is very unusual as it has 2x model TTS generation methods #
##########################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one #################################################################
##########################################################################################################################################
##########################################################################################################################################
# In here all the possible options are sent over (text, voice to use, language, speed etc etc) and it's up to you how you use them, or not.
Expand All @@ -904,9 +898,7 @@ def chunk_text(self, text, max_chars=135):
# we have to have an option for Streaming, even if our TTS engine doesn't support streaming. So in that case, we would set streaming_capable
# as false in our model_settings.JSON file, meaning streaming will never be called. However, we have to put a fake streaming routine in our
# function below (or a real function if it does support streaming of course). Parler has an example of a fake streaming function, which is
# very clearly highlighted in its model_engine.py script.
# Piper TTS, which uses command line based calls and therefore has different ones for Windows and Linux/Mac, has an example of doing this
# within its model_engine.py file.
# very clearly highlighted in its model_engine.py script.
async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming):
if voice == "No Voices Found":
print(f"[{self.branding}ENG] \033[91mError\033[0m: No voices found to generate TTS.")
Expand Down
6 changes: 3 additions & 3 deletions system/tts_engines/f5tts/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"model_details": {
"manufacturer_name": "F5-TTS",
"manufacturer_website": "https://github.com/SWivid/F5-TTS",
"model_description": "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
"model_description": "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching. Currently F5-TTS only supports English and Chinese languages. For more information, please look at the developers website (link above). Authors (Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen). Journal (arXiv preprint arXiv:2410.06885)"
},
"model_capabilties": {
"audio_format": "wav",
Expand All @@ -25,8 +25,8 @@
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
"pitch_set": 0,
"repetitionpenalty_set": 10,
"temperature_set": 0.75
Expand Down

0 comments on commit 5d43b23

Please sign in to comment.