Skip to content

Commit

Permalink
Added colab notebook on repo (metavoiceio#78)
Browse files Browse the repository at this point in the history
* Add END_OF_AUDIO_TOKEN constant and use it in TTS class

* Refactor normalize_text function to handle whitespace and special characters

* Update links in README.md
  • Loading branch information
shhossain authored Feb 29, 2024
1 parent 3758971 commit 001b82f
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 13 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


<p>
<a href="https://ttsdemo.themetavoice.xyz/"><b>Playground</b></a> | <a target="_blank" style="display: inline-block; vertical-align: middle" href="https://colab.research.google.com/drive/1UmjE1mzfG4td0rCjJEaAWGQXpn_GuwwY?authuser=0#scrollTo=mPgTfUdBJF1B">
<a href="https://ttsdemo.themetavoice.xyz/"><b>Playground</b></a> | <a target="_blank" style="display: inline-block; vertical-align: middle" href="https://colab.research.google.com/github/metavoiceio/metavoice-src/blob/main/colab_demo.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
</p>
Expand Down Expand Up @@ -72,7 +72,7 @@ python app.py
```

3. Use it via [Hugging Face](https://huggingface.co/metavoiceio)
4. [Google Collab](https://colab.research.google.com/drive/1UmjE1mzfG4td0rCjJEaAWGQXpn_GuwwY?authuser=0#scrollTo=mPgTfUdBJF1B)
4. [Google Colab Demo](https://colab.research.google.com/github/metavoiceio/metavoice-src/blob/main/colab_demo.ipynb)


## Upcoming
Expand Down
109 changes: 109 additions & 0 deletions colab_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Installation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clone the repository"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"!git clone https://github.com/metavoiceio/metavoice-src.git\n",
"%cd metavoice-src"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Install dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"!pip install -r requirements.txt\n",
"!pip install --upgrade torch torchaudio"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"from IPython.display import Audio, display\n",
"from fam.llm.fast_inference import TTS\n",
"\n",
"tts = TTS()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"wav_file = tts.synthesise(\n",
" text=\"This is a demo of text to speech by MetaVoice-1B, an open-source foundational audio model.\",\n",
" spk_ref_path=\"assets/bria.mp3\" # you can use any speaker reference file (WAV, OGG, MP3, FLAC, etc.)\n",
")\n",
"display(Audio(wav_file, autoplay=True))"
]
}
],
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 5 additions & 3 deletions fam/llm/fast_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@


class TTS:
END_OF_AUDIO_TOKEN = 1024

def __init__(
self, model_name: str = "metavoiceio/metavoice-1B-v0.1", *, seed: int = 1337, output_dir: str = "outputs"
):
Expand All @@ -42,7 +44,7 @@ def __init__(
self._dtype = get_default_dtype()
self._device = get_device()
self._model_dir = snapshot_download(repo_id=model_name)
self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=self.END_OF_AUDIO_TOKEN)
self.output_dir = output_dir
os.makedirs(self.output_dir, exist_ok=True)

Expand All @@ -57,7 +59,7 @@ def __init__(
init_from="resume",
output_dir=self.output_dir,
)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=self.END_OF_AUDIO_TOKEN)
self.llm_second_stage = Model(
config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
)
Expand Down Expand Up @@ -103,7 +105,7 @@ def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.
guidance_scale=torch.tensor(guidance_scale, device=self._device, dtype=self.precision),
temperature=torch.tensor(temperature, device=self._device, dtype=self.precision),
)
text_ids, extracted_audio_ids = self.first_stage_adapter.decode([tokens])
_, extracted_audio_ids = self.first_stage_adapter.decode([tokens])

b_speaker_embs = spk_emb.unsqueeze(0)

Expand Down
5 changes: 1 addition & 4 deletions fam/llm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@ def normalize_text(text: str) -> str:
non_bpe_points = [(c, ord(c)) for c in non_bpe_chars]
raise ValueError(f"Non-supported character found: {non_bpe_points}")

text = text.replace("\t", " ")
text = text.replace("\n", " ")
text = text.replace("*", " ")
text = text.strip()
text = text.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("*", " ").strip()
text = re.sub("\s\s+", " ", text) # remove multiple spaces
return text

Expand Down
7 changes: 3 additions & 4 deletions fam/quantiser/audio/speaker_encoder/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,12 @@ def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_

mel = audio.wav_to_mel_spectrogram(wav)
mels = np.array([mel[s] for s in mel_slices])
mels = torch.from_numpy(mels).to(self.device) # type: ignore
with torch.no_grad():
mels = torch.from_numpy(mels).to(self.device) # type: ignore
partial_embeds = self(mels)

if numpy:
partial_embeds = partial_embeds.cpu().numpy()
raw_embed = np.mean(partial_embeds, axis=0)
raw_embed = np.mean(partial_embeds.cpu().numpy(), axis=0)
embed = raw_embed / np.linalg.norm(raw_embed, 2)
else:
raw_embed = partial_embeds.mean(dim=0)
Expand All @@ -111,7 +110,7 @@ def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
return raw_embed / np.linalg.norm(raw_embed, 2)

def embed_utterance_from_file(self, fpath: str, numpy: bool) -> torch.Tensor:
wav_tgt, _ = librosa.load(fpath, sr=16000)
wav_tgt, _ = librosa.load(fpath, sr=sampling_rate)
wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)

embedding = self.embed_utterance(wav_tgt, numpy=numpy)
Expand Down

0 comments on commit 001b82f

Please sign in to comment.