Skip to content

Commit

Permalink
F5-TTS Engine/Settings changes
Browse files Browse the repository at this point in the history
  • Loading branch information
erew123 authored Nov 4, 2024
1 parent bcd84a4 commit 5d43b23
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 32 deletions.
18 changes: 12 additions & 6 deletions system/tts_engines/f5tts/f5tts_settings_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def f5tts_model_alltalk_settings(model_config_data):
temperature_set_gr = gr.Slider(value=float(model_config_data["settings"]["temperature_set"]), minimum=0, maximum=1, step=0.05, label="Temperature" if model_config_data["model_capabilties"]["temperature_capable"] else "Temperature N/A", interactive=model_config_data["model_capabilties"]["temperature_capable"])
repetitionpenalty_set_gr = gr.Slider(value=float(model_config_data["settings"]["repetitionpenalty_set"]), minimum=1, maximum=20, step=1, label="Repetition Penalty" if model_config_data["model_capabilties"]["repetitionpenalty_capable"] else "Repetition N/A", interactive=model_config_data["model_capabilties"]["repetitionpenalty_capable"])
pitch_set_gr = gr.Slider(value=float(model_config_data["settings"]["pitch_set"]), minimum=-10, maximum=10, step=1, label="Pitch" if model_config_data["model_capabilties"]["pitch_capable"] else "Pitch N/A", interactive=model_config_data["model_capabilties"]["pitch_capable"])
generationspeed_set_gr = gr.Slider(value=float(model_config_data["settings"]["generationspeed_set"]), minimum=0.25, maximum=2.00, step=0.25, label="Speed" if model_config_data["model_capabilties"]["generationspeed_capable"] else "Speed N/A", interactive=model_config_data["model_capabilties"]["generationspeed_capable"])
generationspeed_set_gr = gr.Slider(value=float(model_config_data["settings"]["generationspeed_set"]), minimum=0.30, maximum=2.00, step=0.10, label="Speed" if model_config_data["model_capabilties"]["generationspeed_capable"] else "Speed N/A", interactive=model_config_data["model_capabilties"]["generationspeed_capable"])
with gr.Row():
with gr.Column():
gr.Markdown("### OpenAI Voice Mappings")
Expand Down Expand Up @@ -467,13 +467,13 @@ def refresh_file_list():
with gr.Row():
gr.Markdown("""
### 🟧 Where are the f5tts models stored?
This extension will download the models to `/alltalk_tts/models/f5tts/` folder.
This extension will download the f5tts models to `/alltalk_tts/models/f5tts/` folder.
### 🟧 How do reference voices work?
### 🟧 How do clone/reference voices work?
F5-TTS uses voice samples with corresponding reference text files for voice cloning:
1. Place your WAV voice samples in the `/alltalk_tts/voices/` folder
2. Create a matching `.reference.txt` file containing the exact text spoken in the recording
3. Use the Reference Text Manager tab to create and manage these text files
2. Create a matching `.reference.txt` file containing the exact text spoken in the recording.
3. Use the `Reference Text/Sample Manager` tab to create and manage these text files
4. Only voice samples with valid reference text files will be available for use
For best results:
Expand All @@ -487,7 +487,13 @@ def refresh_file_list():
> When `Disabled`, your output wav files will be left untouched.
> When set to a setting `1 Day` or greater, your output wav files older than that time period will be automatically deleted on start-up of AllTalk.
""")

gr.Markdown("""
### 🟧 Speed settings and voice cloning
F5-TTS appears to generate audio a little faster than the audio sample. As such, in AllTalk, the speed setting for generation is set to 0.9 (see the F5-TTS `Default Settings` tab). Certainly, the better the punctuation you use, the better the end result, both in the audio sample's reference file and the text you send to be generated as TTS. If you wish to globally adjust this, you can make the change on the Default Settings tab.
### 🟧 Cloned audio and punctuation.
When F5-TTS reproduces audio, punctuation appears to matter greatly. As such, `Im` and `I'm` or `I am` will all sound a little different. Additionally, it appears to want to spell out text in CAPS. You may want to look at the F5-TTS developers' page for more information on this.
""")

return app

Expand Down
38 changes: 15 additions & 23 deletions system/tts_engines/f5tts/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,19 @@

def install_and_restart():
try:
print("##########################################")
print("F5-TTS not found. Attempting to install...")
print("##########################################")
subprocess.check_call([
sys.executable,
"-m",
"pip",
"install",
"git+https://github.com/SWivid/F5-TTS.git"
])
print("########################################################")
print("F5-TTS installed successfully! Restarting application...")
print("########################################################")

# Get the current script's path
script_path = sys.argv[0]
Expand All @@ -56,7 +60,9 @@ def install_and_restart():
os.execv(sys.executable, ['python'] + sys.argv)

except subprocess.CalledProcessError as e:
print("########################################################")
print(f"Failed to install F5-TTS: {str(e)}")
print("########################################################")
raise ImportError("Could not install required package F5-TTS")

try:
Expand Down Expand Up @@ -152,6 +158,9 @@ def __init__(self):
self.debug_tts = configfile_data.get("debugging").get("debug_tts") # Can be used within this script as a True/False flag for generally debugging the TTS generation process.
self.debug_tts_variables = configfile_data.get("debugging").get("debug_tts_variables") # Can be used within this script as a True/False flag for generally debugging variables (if you wish).

############################################################################
# DONT CHANGE # These settings are specific to the F5-TTS Model/Engine ####
############################################################################
# Add F5-TTS specific parameters
self.target_sample_rate = 24000
self.n_mel_channels = 100
Expand Down Expand Up @@ -444,16 +453,10 @@ def has_reference_text(wav_path):

#################################################################################
#################################################################################
# CHANGE ME # Model loading # Piper does not actually load/stay resident in RAM #
# CHANGE ME # Model loading #####################################################
#################################################################################
#################################################################################
# This function will handle the loading of your model, into VRAM/CUDA, System RAM or whatever.
# In XTTS, which has 2x model loader types, there are 2x loaders. They are called by "def handle_tts_method_change"
# In Piper we fake a model loader as Piper doesn't actually load a model into CUDA/System RAM as such. So, in that
# situation, api_manual_load_model is kind of a blank function. Though we do set self.is_tts_model_loaded = True
# as this is used elsewhere in the scripts to confirm that a model is available to be used for TTS generation.
# We always check for "No Models Available" being sent as that means we are trying to load in a model that
# doesn't exist/wasn't found on script start-up e.g. someone deleted the model from the folder or something.
async def api_manual_load_model(self, model_name):
if model_name == "No Models Found":
print(f"[{self.branding}ENG] \033[91mError\033[0m: No models for this TTS engine were found to load.")
Expand Down Expand Up @@ -567,11 +570,6 @@ async def api_manual_load_model(self, model_name):
###############################
###############################
# This function will handle the UN-loading of your model, from VRAM/CUDA, System RAM or whatever.
# In XTTS, that model loads into CUDA/System Ram, so when we swap models, we want to unload the current model
# free up the memory and then load in the new model to VRAM/CUDA. On the flip side of that, Piper doesn't
# load into memory, so we just need to put a fake function here that doesn't really do anything
# other than set "self.is_tts_model_loaded = False", which would be set back to true by the model loader.
# So look at the Piper model_engine.py if you DON'T need to unload models.
async def unload_model(self):
# ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
# ↑↑↑ Keep everything above this line ↑↑↑
Expand All @@ -597,15 +595,11 @@ async def unload_model(self):

###################################################################################################################################
###################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one # XTTS is very unusal as it has 2x model loading methods #
# CHANGE ME # Model changing. Unload out old model and load in a new one ##########################################################
###################################################################################################################################
###################################################################################################################################
# This function is your central model loading/unloading handler that deals with the above functions as necessary, to call loading, unloading,
# swapping DeepSpeed, Low vram etc. This function gets called with an "engine name - model name" type call. In XTTS, because there are 2x
# model loader types (XTTS and APILocal), we take tts_method and split the "engine name - model name" into a loader type and the model
# that it needs to load in and then we call the correct loader function. Whereas in Piper, which doesn't load models into memory at all,
# we just have a fake function that doesn't really do anything. We always check to see if the model name has "No Models Available" in the
# name that's sent over, just to catch any potential errors. We display the start load time and end load time. That's about it.
# swapping DeepSpeed, Low vram etc. This function gets called with an "engine name - model name" type call.
async def handle_tts_method_change(self, tts_method):
generate_start_time = time.time() # Record the start time of loading the model
if "No Models Available" in self.available_models:
Expand Down Expand Up @@ -887,9 +881,9 @@ def chunk_text(self, text, max_chars=135):

return chunks

##########################################################################################################################################
##########################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one # XTTS is very unusual as it has 2x model TTS generation methods #
##########################################################################################################################################
# CHANGE ME # Model changing. Unload out old model and load in a new one #################################################################
##########################################################################################################################################
##########################################################################################################################################
# In here all the possible options are sent over (text, voice to use, language, speed etc etc) and it's up to you how you use them, or not.
Expand All @@ -904,9 +898,7 @@ def chunk_text(self, text, max_chars=135):
# we have to have an option for Streaming, even if our TTS engine doesn't support streaming. So in that case, we would set streaming_capable
# as false in our model_settings.JSON file, meaning streaming will never be called. However, we have to put a fake streaming routine in our
# function below (or a real function if it does support streaming of course). Parler has an example of a fake streaming function, which is
# very clearly highlighted in its model_engine.py script.
# Piper TTS, which uses command line based calls and therefore has different ones for Windows and Linux/Mac, has an example of doing this
# within its model_engine.py file.
# very clearly highlighted in its model_engine.py script.
async def generate_tts(self, text, voice, language, temperature, repetition_penalty, speed, pitch, output_file, streaming):
if voice == "No Voices Found":
print(f"[{self.branding}ENG] \033[91mError\033[0m: No voices found to generate TTS.")
Expand Down
6 changes: 3 additions & 3 deletions system/tts_engines/f5tts/model_settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"model_details": {
"manufacturer_name": "F5-TTS",
"manufacturer_website": "https://github.com/SWivid/F5-TTS",
"model_description": "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
"model_description": "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching. Currently F5-TTS only supports English and Chinese languages. For more information, please look at the developers website (link above). Authors (Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen). Journal (arXiv preprint arXiv:2410.06885)"
},
"model_capabilties": {
"audio_format": "wav",
Expand All @@ -25,8 +25,8 @@
"def_narrator_voice": "female_01.wav",
"deepspeed_enabled": false,
"engine_installed": true,
"generationspeed_set": 1,
"lowvram_enabled": false,
"generationspeed_set": 0.9,
"lowvram_enabled": true,
"pitch_set": 0,
"repetitionpenalty_set": 10,
"temperature_set": 0.75
Expand Down

0 comments on commit 5d43b23

Please sign in to comment.