feat: swap sample speaker reference (metavoiceio#21)
sidroopdaska and sid authored Feb 8, 2024
Co-authored-by: sid <sid@themetavoice.xyz>
1 parent 11969e5 commit 11428f9
Showing 4 changed files with 12 additions and 6 deletions.
README.md (4 changes: 2 additions & 2 deletions)
@@ -2,9 +2,9 @@

MetaVoice-1B is a 1.2B parameter base model trained on 100K hours of speech for TTS (text-to-speech). It has been built with the following priorities:
* **Emotional speech rhythm and tone** in English. No hallucinations.
+* **Zero-shot cloning for American & British voices**, with 30s reference audio.
* Support for (cross-lingual) **voice cloning with finetuning**.
  * We have had success with as little as 1 minute training data for Indian speakers.
-* **Zero-shot cloning for American & British voices**, with 30s reference audio.
* Support for **long-form synthesis**.

We’re releasing MetaVoice-1B under the Apache 2.0 license, *it can be used without restrictions*.
@@ -28,7 +28,7 @@ pip install -e .
## Usage
1. Download it and use it anywhere (including locally) with our [reference implementation](/fam/llm/sample.py),
```bash
-python fam/llm/sample.py --huggingface_repo_id="metavoiceio/metavoice-1B-v0.1" --spk_cond_path="assets/ava.flac"
+python fam/llm/sample.py --huggingface_repo_id="metavoiceio/metavoice-1B-v0.1" --spk_cond_path="assets/bria.mp3"
```

2. Deploy it on any cloud (AWS/GCP/Azure), using our [inference server](/fam/llm/serving.py)
Binary file removed assets/ava.flac
Binary file added assets/bria.mp3
fam/llm/sample.py (14 changes: 10 additions & 4 deletions)
@@ -10,6 +10,7 @@
from dataclasses import dataclass
from typing import List, Literal, Optional, Type

+import librosa
import torch
import tqdm
import tqdm.contrib.concurrent
@@ -401,6 +402,7 @@ def get_cached_file(file_or_uri: str):
    """
    is_uri = file_or_uri.startswith("http")

+    cache_path = None
    if is_uri:
        ext = pathlib.Path(file_or_uri).suffix
        # hash the file path to get the cache name
@@ -412,14 +414,18 @@ def get_cached_file(file_or_uri: str):
        if not os.path.exists(cache_path):
            command = f"curl -o {cache_path} {file_or_uri}"
            subprocess.run(command, shell=True, check=True)
-
-        return cache_path
    else:
        if os.path.exists(file_or_uri):
-            return file_or_uri
+            cache_path = file_or_uri
        else:
            raise FileNotFoundError(f"File {file_or_uri} not found!")
+
+    # check audio file is at min. 30s in length
+    audio, sr = librosa.load(cache_path)
+    assert librosa.get_duration(y=audio, sr=sr) >= 30, "Speaker reference audio file needs to be >= 30s in duration."
+
+    return cache_path


def get_cached_embedding(local_file_path: str, spkemb_model):
    if not os.path.exists(local_file_path):
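This hunk carries the substantive code change: once the reference file has been resolved (downloaded via curl for a URI, or taken as-is for a local path), get_cached_file now decodes it with librosa and asserts that it is at least 30 seconds long, matching the "30s reference audio" requirement for zero-shot cloning in the README. Below is a minimal standalone sketch of that validation; the helper name check_speaker_reference is illustrative and not part of the repo.

```python
import librosa


def check_speaker_reference(path: str, min_seconds: float = 30.0) -> float:
    """Decode a speaker-reference clip and verify it is long enough.

    Mirrors the assertion added to get_cached_file: librosa handles wav,
    flac and mp3, and get_duration reports the decoded length in seconds.
    """
    audio, sr = librosa.load(path)  # mono, resampled to librosa's default rate
    duration = librosa.get_duration(y=audio, sr=sr)
    if duration < min_seconds:
        raise ValueError(
            f"Speaker reference audio file needs to be >= {min_seconds}s in duration, got {duration:.1f}s"
        )
    return duration


if __name__ == "__main__":
    # assets/bria.mp3 is the sample reference this commit adds.
    print(check_speaker_reference("assets/bria.mp3"))
```

Raising an exception rather than asserting keeps the check active even when Python runs with -O; the repo's version uses a plain assert.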
@@ -596,7 +602,7 @@ class SamplingControllerConfig:
    """Absolute path to the model directory."""

    spk_cond_path: str
-    """Path to speaker reference file. Supports: wav, flac & mp3"""
+    """Path to speaker reference file. Min. 30s of audio required. Supports both local paths & public URIs. Audio formats: wav, flac & mp3"""

    text: str = (
        "This is a demo of text to speech by MetaVoice-1B, an open-source foundational audio model by MetaVoice."
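The docstring change documents the new contract for spk_cond_path: the reference may be a local file or a public URI, in wav, flac or mp3, and must contain at least 30 seconds of audio. A rough usage sketch follows, assuming the package has been installed with pip install -e . so that fam.llm.sample is importable; the URL shown is a placeholder, not a real asset.

```python
from fam.llm.sample import get_cached_file

# Local file: returned unchanged once the 30s duration check passes.
local_ref = get_cached_file("assets/bria.mp3")

# Public URI: fetched into a local cache with curl, then the same check runs.
# The URL below is a placeholder for any publicly reachable reference clip.
remote_ref = get_cached_file("https://example.com/speaker_reference.wav")

print(local_ref, remote_ref)
```

In either path, a reference clip shorter than 30 seconds fails the duration assertion.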
