 import gradio as gr
-import gc
 import torch
 import os
 
 from typing import TYPE_CHECKING
 
+from tts_webui.utils.manage_model_state import manage_model_state
+from tts_webui.utils.list_dir_models import unload_model_button
+
 if TYPE_CHECKING:
     from transformers import Pipeline
 
@@ -14,7 +16,7 @@ def extension__tts_generation_webui():
     return {
         "package_name": "extension_whisper",
         "name": "Whisper",
-        "version": "0.0.1",
+        "version": "0.0.2",
         "requirements": "git+https://github.com/rsxdalv/extension_whisper@main",
         "description": "Whisper allows transcribing audio files.",
         "extension_type": "interface",
@@ -28,40 +30,59 @@ def extension__tts_generation_webui():
     }
 
 
-local_dir = os.path.join("data", "models", "whisper")
-local_cache_dir = os.path.join(local_dir, "cache")
+@manage_model_state("whisper")
+def get_model(
+    model_name="openai/whisper-large-v3",
+    torch_dtype=torch.float16,
+    device="cuda:0",
+    compile=False,
+):
+    from transformers import AutoModelForSpeechSeq2Seq
+    from transformers import AutoProcessor
+
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_name, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+    ).to(device)
+    if compile:
+        model.generation_config.cache_implementation = "static"
+        model.generation_config.max_new_tokens = 256
+        model.forward = torch.compile(
+            model.forward, mode="reduce-overhead", fullgraph=True
+        )
 
-pipe = None
-last_model_name = None
+    processor = AutoProcessor.from_pretrained(model_name)
 
+    return model, processor
 
-def unload_models():
-    global pipe, last_model_name
-    pipe = None
-    last_model_name = None
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    return "Unloaded"
 
+local_dir = os.path.join("data", "models", "whisper")
+local_cache_dir = os.path.join(local_dir, "cache")
 
+
+@manage_model_state("whisper-pipe")
 def get_pipe(model_name, device="cuda:0") -> "Pipeline":
     from transformers import pipeline
 
-    global pipe, last_model_name
-    if pipe is not None:
-        if model_name == last_model_name:
-            return pipe
-        unload_models()
-    pipe = pipeline(
-        "automatic-speech-recognition",
+    torch_dtype = torch.float16
+
+    model, processor = get_model(
+        # model_name, torch_dtype=torch.float16, device=device, compile=False
         model_name,
+        torch_dtype=torch_dtype,
+        device=device,
+        compile=False,
+    )
+    return pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        # chunk_length_s=30,
+        # batch_size=16, # batch size for inference - set based on your device
         torch_dtype=torch.float16,
         model_kwargs={"cache_dir": local_cache_dir},
         device=device,
     )
-    last_model_name = model_name
-    return pipe
 
 
 def transcribe(inputs, model_name="openai/whisper-large-v3"):
@@ -72,13 +93,11 @@ def transcribe(inputs, model_name="openai/whisper-large-v3"):
 
     pipe = get_pipe(model_name)
 
-    generate_kwargs = (
-        {"task": "transcribe"} if model_name == "openai/whisper-large-v3" else {}
-    )
-
     result = pipe(
         inputs,
-        generate_kwargs=generate_kwargs,
+        generate_kwargs=(
+            {"task": "transcribe"} if model_name == "openai/whisper-large-v3" else {}
+        ),
         return_timestamps=True,
     )
     return result["text"]
@@ -108,7 +127,8 @@ def transcribe_ui():
         text = gr.Textbox(label="Transcription", interactive=False)
 
     with gr.Row():
-        unload_models_button = gr.Button("Unload models")
+        unload_model_button("whisper-pipe")
+        unload_model_button("whisper")
 
     transcribe_button = gr.Button("Transcribe", variant="primary")
 
@@ -117,21 +137,12 @@ def transcribe_ui():
         inputs=[audio, model_dropdown],
         outputs=[text],
         api_name="whisper_transcribe",
-    ).then(
-        fn=lambda: gr.Button(value="Unload models"),
-        outputs=[unload_models_button],
-    )
-
-    unload_models_button.click(
-        fn=unload_models,
-        outputs=[unload_models_button],
-        api_name="whisper_unload_models",
     )
 
 
 if __name__ == "__main__":
     if "demo" in locals():
-        demo.close()
+        locals()["demo"].close()
 
     with gr.Blocks() as demo:
         with gr.Tab("Whisper"):
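
Note: this commit swaps the module-level `pipe` / `last_model_name` globals and the manual `unload_models()` helper for the `manage_model_state` decorator and `unload_model_button` helper from tts_webui. Their implementation is not part of this diff; as a rough mental model only (an assumption, not the actual tts_webui code), the decorator can be read as a per-namespace cache that loads a model once, hands back the cached instance on later calls, and lets the "Unload model" buttons clear that cache:

# Hypothetical sketch of a manage_model_state-style cache (assumption;
# the real tts_webui implementation may differ in keying and eviction).
import functools
import gc
import torch

_loaded = {}  # (namespace, call arguments) -> loaded object


def manage_model_state(namespace):
    def decorator(load_fn):
        @functools.wraps(load_fn)
        def wrapper(*args, **kwargs):
            key = (namespace, args, tuple(sorted(kwargs.items())))
            if key not in _loaded:  # load once, reuse on subsequent calls
                _loaded[key] = load_fn(*args, **kwargs)
            return _loaded[key]
        return wrapper
    return decorator


def unload_model(namespace):
    # Roughly what an "Unload model" button for one namespace would trigger.
    for key in [k for k in _loaded if k[0] == namespace]:
        del _loaded[key]
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()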
0 commit comments
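The transcription endpoint is still registered with `api_name="whisper_transcribe"`, so it can be driven programmatically while the web UI is running. A minimal sketch with `gradio_client`; the URL and audio path are placeholders, and on newer Gradio versions the file argument may need to be wrapped with `gradio_client.handle_file`:

from gradio_client import Client

client = Client("http://127.0.0.1:7860/")  # placeholder: address of the running web UI
text = client.predict(
    "sample.wav",                # placeholder: path to a local audio file
    "openai/whisper-large-v3",   # model_name, as offered in the dropdown
    api_name="/whisper_transcribe",
)
print(text)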