Added colab notebook on repo (metavoiceio#78)

* Add END_OF_AUDIO_TOKEN constant and use it in TTS class
* Refactor normalize_text function to handle whitespace and special characters
* Update links in README.md
Pkpk11 · Feb 29, 2024 · 001b82f · 001b82f
1 parent 3758971
commit 001b82f
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 
 <p>
-<a href="https://ttsdemo.themetavoice.xyz/"><b>Playground</b></a> | <a target="_blank" style="display: inline-block; vertical-align: middle" href="https://colab.research.google.com/drive/1UmjE1mzfG4td0rCjJEaAWGQXpn_GuwwY?authuser=0#scrollTo=mPgTfUdBJF1B">
+<a href="https://ttsdemo.themetavoice.xyz/"><b>Playground</b></a> | <a target="_blank" style="display: inline-block; vertical-align: middle" href="https://colab.research.google.com/github/metavoiceio/metavoice-src/blob/main/colab_demo.ipynb">
   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 </a> 
 </p>
@@ -72,7 +72,7 @@ python app.py
 ```
 
 3. Use it via [Hugging Face](https://huggingface.co/metavoiceio)
-4. [Google Collab](https://colab.research.google.com/drive/1UmjE1mzfG4td0rCjJEaAWGQXpn_GuwwY?authuser=0#scrollTo=mPgTfUdBJF1B)
+4. [Google Colab Demo](https://colab.research.google.com/github/metavoiceio/metavoice-src/blob/main/colab_demo.ipynb)
 
 
 ## Upcoming

diff --git a/colab_demo.ipynb b/colab_demo.ipynb
@@ -0,0 +1,109 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## Installation"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Clone the repository"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "vscode": {
+                    "languageId": "plaintext"
+                }
+            },
+            "outputs": [],
+            "source": [
+                "!git clone https://github.com/metavoiceio/metavoice-src.git\n",
+                "%cd metavoice-src"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "### Install dependencies"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "vscode": {
+                    "languageId": "plaintext"
+                }
+            },
+            "outputs": [],
+            "source": [
+                "!pip install -r requirements.txt\n",
+                "!pip install --upgrade torch torchaudio"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## Inference"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "vscode": {
+                    "languageId": "plaintext"
+                }
+            },
+            "outputs": [],
+            "source": [
+                "from IPython.display import Audio, display\n",
+                "from fam.llm.fast_inference import TTS\n",
+                "\n",
+                "tts = TTS()"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": null,
+            "metadata": {
+                "vscode": {
+                    "languageId": "plaintext"
+                }
+            },
+            "outputs": [],
+            "source": [
+                "wav_file = tts.synthesise(\n",
+                "  text=\"This is a demo of text to speech by MetaVoice-1B, an open-source foundational audio model.\",\n",
+                "  spk_ref_path=\"assets/bria.mp3\" # you can use any speaker reference file (WAV, OGG, MP3, FLAC, etc.)\n",
+                ")\n",
+                "display(Audio(wav_file, autoplay=True))"
+            ]
+        }
+    ],
+    "metadata": {
+        "colab": {
+            "provenance": [],
+            "gpuType": "T4"
+        },
+        "kernelspec": {
+            "name": "python3",
+            "display_name": "Python 3"
+        },
+        "language_info": {
+            "name": "python"
+        },
+        "accelerator": "GPU"
+    },
+    "nbformat": 4,
+    "nbformat_minor": 2
+}
diff --git a/fam/llm/fast_inference.py b/fam/llm/fast_inference.py
@@ -30,6 +30,8 @@
 
 
 class TTS:
+    END_OF_AUDIO_TOKEN = 1024
+
     def __init__(
         self, model_name: str = "metavoiceio/metavoice-1B-v0.1", *, seed: int = 1337, output_dir: str = "outputs"
     ):
@@ -42,7 +44,7 @@ def __init__(
         self._dtype = get_default_dtype()
         self._device = get_device()
         self._model_dir = snapshot_download(repo_id=model_name)
-        self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
+        self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=self.END_OF_AUDIO_TOKEN)
         self.output_dir = output_dir
         os.makedirs(self.output_dir, exist_ok=True)
 
@@ -57,7 +59,7 @@ def __init__(
             init_from="resume",
             output_dir=self.output_dir,
         )
-        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
+        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=self.END_OF_AUDIO_TOKEN)
         self.llm_second_stage = Model(
             config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
         )
@@ -103,7 +105,7 @@ def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.
             guidance_scale=torch.tensor(guidance_scale, device=self._device, dtype=self.precision),
             temperature=torch.tensor(temperature, device=self._device, dtype=self.precision),
         )
-        text_ids, extracted_audio_ids = self.first_stage_adapter.decode([tokens])
+        _, extracted_audio_ids = self.first_stage_adapter.decode([tokens])
 
         b_speaker_embs = spk_emb.unsqueeze(0)
 

diff --git a/fam/llm/utils.py b/fam/llm/utils.py
@@ -45,10 +45,7 @@ def normalize_text(text: str) -> str:
         non_bpe_points = [(c, ord(c)) for c in non_bpe_chars]
         raise ValueError(f"Non-supported character found: {non_bpe_points}")
 
-    text = text.replace("\t", " ")
-    text = text.replace("\n", " ")
-    text = text.replace("*", " ")
-    text = text.strip()
+    text = text.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("*", " ").strip()
     text = re.sub("\s\s+", " ", text)  # remove multiple spaces
     return text
 

diff --git a/fam/quantiser/audio/speaker_encoder/model.py b/fam/quantiser/audio/speaker_encoder/model.py
@@ -90,13 +90,12 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_
 
         mel = audio.wav_to_mel_spectrogram(wav)
         mels = np.array([mel[s] for s in mel_slices])
+        mels = torch.from_numpy(mels).to(self.device)  # type: ignore
         with torch.no_grad():
-            mels = torch.from_numpy(mels).to(self.device)  # type: ignore
             partial_embeds = self(mels)
 
         if numpy:
-            partial_embeds = partial_embeds.cpu().numpy()
-            raw_embed = np.mean(partial_embeds, axis=0)
+            raw_embed = np.mean(partial_embeds.cpu().numpy(), axis=0)
             embed = raw_embed / np.linalg.norm(raw_embed, 2)
         else:
             raw_embed = partial_embeds.mean(dim=0)
@@ -111,7 +110,7 @@ def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
         return raw_embed / np.linalg.norm(raw_embed, 2)
 
     def embed_utterance_from_file(self, fpath: str, numpy: bool) -> torch.Tensor:
-        wav_tgt, _ = librosa.load(fpath, sr=16000)
+        wav_tgt, _ = librosa.load(fpath, sr=sampling_rate)
         wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
 
         embedding = self.embed_utterance(wav_tgt, numpy=numpy)