docs: info on python version and min. gpu memory (metavoiceio#34)

* docs: info on python version and min. gpu mem
* revert: edit
* remove: duplicate
* remove: duplicate
* revert: to union typing
* update: python versions

---------

Co-authored-by: sid <sid@themetavoice.xyz>
Pkpk11 · Feb 10, 2024 · 7282924 · 7282924
1 parent 0df797a
commit 7282924
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,10 @@ We’re releasing MetaVoice-1B under the Apache 2.0 license, *it can be used wit
 
 Try out the [demo](https://ttsdemo.themetavoice.xyz/)!
 
-## Installation
+## Installation  
+
+**Pre-requisites:** Python >=3.10,<3.12; GPU with >=24GB RAM.
+
 ```bash
 # install ffmpeg
 wget https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz

diff --git a/fam/llm/decoders.py b/fam/llm/decoders.py
@@ -2,7 +2,7 @@
 import pathlib
 import uuid
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import julius
 import torch
@@ -63,7 +63,7 @@ def get_tokens(self, audio_path: str) -> list[list[int]]:
 
     def decode(
         self, tokens: list[list[int]], causal: bool = True, ref_audio_path: Optional[str] = None
-    ) -> str | torch.Tensor:
+    ) -> Union[str, torch.Tensor]:
         # TODO: this has strange behaviour -- if causal is True, it returns tokens. if causal is False, it SAVES the audio file.
         text_ids, extracted_audio_ids = self._data_adapter_fn(tokens)
         text = self.tokeniser_decode_fn(text_ids)

diff --git a/fam/llm/sample.py b/fam/llm/sample.py
@@ -8,7 +8,7 @@
 import tempfile
 from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import List, Literal, Optional, Type
+from typing import List, Literal, Optional, Type, Union
 
 import librosa
 import torch
@@ -452,7 +452,7 @@ def _sample_utterance_batch(
     spkemb_model,
     first_stage_model,
     second_stage_model,
-    enhancer: Optional[Literal["df"] | BaseEnhancer],
+    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
     first_stage_ckpt_path: str,
     second_stage_ckpt_path: str,
     guidance_scale: Optional[float],
@@ -530,7 +530,7 @@ def sample_utterance(
     spkemb_model,
     first_stage_model,
     second_stage_model,
-    enhancer: Optional[Literal["df"] | BaseEnhancer],
+    enhancer: Optional[Union[Literal["df"], BaseEnhancer]],
     first_stage_ckpt_path: str,
     second_stage_ckpt_path: str,
     guidance_scale: Optional[float],

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,9 @@
+[project]
+name = "metavoice"
+version = "0.1.0"
+description = "Foundational model for text to speech"
+requires-python = ">=3.10,<3.12"
+
 [tool.black]
 line-length = 120
 exclude = '''
@@ -12,4 +18,4 @@ exclude = '''
 '''
 
 [tool.isort]
-profile = "black" 
+profile = "black"
diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,7 @@ librosa
 tqdm
 tiktoken==0.5.1
 audiocraft
-numpy<1.25
+numpy
 ninja
 flash-attn
 fastapi