feat: add int4 and int8 weight-only quantisation (metavoiceio#95)
* feat: add int4 and int8 quantisation

* formatting fixes

* update README

* update

* fix pr comments

* add comment

* add comment

---------

Co-authored-by: EC2 Default User <ec2-user@ip-172-31-30-234.eu-west-1.compute.internal>
vatsalaggarwal and EC2 Default User committed Mar 15, 2024
1 parent e2a9c84 commit af516be
Showing 6 changed files with 431 additions and 31 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -60,6 +60,8 @@ poetry install && poetry run pip install torch==2.2.1 torchaudio==2.2.1
## Usage
1. Download it and use it anywhere (including locally) with our [reference implementation](/fam/llm/fast_inference.py)
```bash
# You can use `--quantisation_mode int4` or `--quantisation_mode int8` for experimental faster inference. This will degrade audio quality.
# Note: int8 is currently slower than bf16/fp16 for reasons we haven't debugged yet. For speed, use int4, which is roughly 2x faster than bf16/fp16.
poetry run python -i fam/llm/fast_inference.py

# Run e.g. of API usage within the interactive python session
@@ -71,6 +73,8 @@ tts.synthesise(text="This is a demo of text to speech by MetaVoice-1B, an open-s
2. Deploy it on any cloud (AWS/GCP/Azure), using our [inference server](serving.py) or [web UI](app.py)
```bash
# You can use `--quantisation_mode int4` or `--quantisation_mode int8` for experimental faster inference. This will degrade audio quality.
# Note: int8 is currently slower than bf16/fp16 for reasons we haven't debugged yet. For speed, use int4, which is roughly 2x faster than bf16/fp16.
poetry run python serving.py
poetry run python app.py
```
3 changes: 2 additions & 1 deletion app.py
@@ -7,12 +7,13 @@


import gradio as gr
import tyro

from fam.llm.fast_inference import TTS
from fam.llm.utils import check_audio_file

#### setup model
TTS_MODEL = TTS()
TTS_MODEL = tyro.cli(TTS)

#### setup interface
RADIO_CHOICES = ["Preset voices", "Upload target voice (at least 30s)"]
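The change above replaces the direct `TTS()` call with `tyro.cli(TTS)`, which builds a command-line interface from the constructor's signature, so the new `quantisation_mode` argument becomes a CLI flag. Below is a minimal sketch of that pattern, assuming only that `tyro` is installed; the `Greeter` class and its parameters are illustrative, not part of this repository:

```python
import tyro


class Greeter:
    """Toy stand-in for TTS: tyro reads __init__'s signature and type hints."""

    def __init__(self, name: str = "world", repeat: int = 1):
        self.message = " ".join([f"Hello, {name}!"] * repeat)


if __name__ == "__main__":
    # tyro.cli(Greeter) exposes `name` and `repeat` as command-line flags,
    # parses sys.argv, calls the constructor, and returns the instance.
    # This is the same pattern as `TTS_MODEL = tyro.cli(TTS)` above.
    greeter = tyro.cli(Greeter)
    print(greeter.message)
```

Running it with something like `python greeter.py --name Ada --repeat 2` prints the greeting twice (exact flag spelling follows tyro's defaults).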
24 changes: 21 additions & 3 deletions fam/llm/fast_inference.py
@@ -3,9 +3,11 @@
import tempfile
import time
from pathlib import Path
from typing import Literal, Optional

import librosa
import torch
import tyro
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
@@ -33,10 +35,25 @@ class TTS:
END_OF_AUDIO_TOKEN = 1024

def __init__(
self, model_name: str = "metavoiceio/metavoice-1B-v0.1", *, seed: int = 1337, output_dir: str = "outputs"
self,
model_name: str = "metavoiceio/metavoice-1B-v0.1",
*,
seed: int = 1337,
output_dir: str = "outputs",
quantisation_mode: Optional[Literal["int4", "int8"]] = None,
):
"""
model_name (str): refers to the model identifier from the Hugging Face Model Hub (https://huggingface.co/metavoiceio)
Initialise the TTS model.
Args:
model_name: refers to the model identifier from the Hugging Face Model Hub (https://huggingface.co/metavoiceio)
seed: random seed for reproducibility
output_dir: directory to save output files
quantisation_mode: quantisation mode for the first-stage LLM.
Options:
- None for no quantisation (bf16 or fp16 based on device),
- int4 for int4 weight-only quantisation,
- int8 for int8 weight-only quantisation.
"""

# NOTE: this needs to come first so that we don't change global state when we want to use
@@ -73,6 +90,7 @@ def __init__(
device=self._device,
compile=True,
compile_prefill=True,
quantisation_mode=quantisation_mode,
)

def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.0, temperature=1.0) -> str:
@@ -140,4 +158,4 @@ def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.0, temperature=1.0) -> str:


if __name__ == "__main__":
tts = TTS()
tts = tyro.cli(TTS)
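With `tyro.cli(TTS)` as the entry point, the new argument can also be passed directly when constructing `TTS` from Python. A short usage sketch based on the constructor and docstring above; the text and speaker-reference path are placeholders rather than files from the repository:

```python
from fam.llm.fast_inference import TTS

# quantisation_mode=None keeps bf16/fp16; "int4" and "int8" enable the
# experimental weight-only quantisation added in this commit.
tts = TTS(quantisation_mode="int4")

# synthesise() returns a string (per its signature above); here we just print it.
wav_path = tts.synthesise(
    text="Testing int4 weight-only quantisation.",   # placeholder text
    spk_ref_path="path/to/speaker_reference.wav",    # placeholder reference audio
)
print(wav_path)
```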
64 changes: 39 additions & 25 deletions fam/llm/fast_inference_utils.py
@@ -25,14 +25,17 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import itertools
import time
import warnings
from pathlib import Path
from typing import Optional, Tuple
from typing import Literal, Optional, Tuple

import torch
import torch._dynamo.config
import torch._inductor.config
import tqdm

from fam.llm.fast_quantize import WeightOnlyInt4QuantHandler, WeightOnlyInt8QuantHandler


def device_sync(device):
if "cuda" in device:
@@ -230,28 +233,13 @@ def encode_tokens(tokenizer: TrainedBPETokeniser, text: str, device="cuda") -> torch.Tensor:
return torch.tensor(tokens, dtype=torch.int, device=device)


def _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision):
def _load_model(
checkpoint_path, spk_emb_ckpt_path, device, precision, quantisation_mode: Optional[Literal["int4", "int8"]] = None
):
##### MODEL
with torch.device("meta"):
model = Transformer.from_name("metavoice-1B")

# TODO(quantization): enable
# if "int8" in str(checkpoint_path):
# print("Using int8 weight-only quantization!")
# from quantize import WeightOnlyInt8QuantHandler
# simple_quantizer = WeightOnlyInt8QuantHandler(model)
# model = simple_quantizer.convert_for_runtime()
# from quantize import WeightOnlyInt8QuantHandler

# if "int4" in str(checkpoint_path):
# print("Using int4 quantization!")
# path_comps = checkpoint_path.name.split(".")
# assert path_comps[-2].startswith("g")
# groupsize = int(path_comps[-2][1:])
# from quantize import WeightOnlyInt4QuantHandler
# simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
# model = simple_quantizer.convert_for_runtime()

checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=False)
state_dict = checkpoint["model"]
# convert MetaVoice-1B model weights naming to gptfast naming
@@ -290,11 +278,34 @@ def _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision):
k = k.replace(".mlp.c_proj.", ".feed_forward.w2.")

model.load_state_dict(state_dict, assign=True)
# simple_quantizer = WeightOnlyInt8QuantHandler(model)
# quantized_state_dict = simple_quantizer.create_quantized_state_dict()
# model = simple_quantizer.convert_for_runtime()
# model.load_state_dict(quantized_state_dict, assign=True)
model = model.to(device=device, dtype=precision)
model = model.to(device=device, dtype=torch.bfloat16)

if quantisation_mode == "int8":
warnings.warn(
"int8 quantisation is slower than bf16/fp16 for undebugged reasons! Please set optimisation_mode to `None` or to `int4`."
)
warnings.warn(
"quantisation will degrade the quality of the audio! Please set optimisation_mode to `None` for best quality."
)
simple_quantizer = WeightOnlyInt8QuantHandler(model)
quantized_state_dict = simple_quantizer.create_quantized_state_dict()
model = simple_quantizer.convert_for_runtime()
model.load_state_dict(quantized_state_dict, assign=True)
model = model.to(device=device, dtype=torch.bfloat16)
# TODO: int8/int4 doesn't decrease VRAM usage substantially... fix that (might be linked to kv-cache)
torch.cuda.empty_cache()
elif quantisation_mode == "int4":
warnings.warn(
"quantisation will degrade the quality of the audio! Please set optimisation_mode to `None` for best quality."
)
simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize=128)
quantized_state_dict = simple_quantizer.create_quantized_state_dict()
model = simple_quantizer.convert_for_runtime(use_cuda=True)
model.load_state_dict(quantized_state_dict, assign=True)
model = model.to(device=device, dtype=torch.bfloat16)
torch.cuda.empty_cache()
elif quantisation_mode is not None:
raise ValueError(f"Invalid quantisation mode {quantisation_mode}! Must be either 'int4' or 'int8'!")

###### TOKENIZER
tokenizer_info = checkpoint.get("meta", {}).get("tokenizer", {})
@@ -318,14 +329,17 @@ def build_model(
compile_prefill: bool = False,
compile: bool = True,
device: str = "cuda",
quantisation_mode: Optional[Literal["int4", "int8"]] = None,
):
assert checkpoint_path.is_file(), checkpoint_path

print(f"Using device={device}")

print("Loading model ...")
t0 = time.time()
model, tokenizer, smodel = _load_model(checkpoint_path, spk_emb_ckpt_path, device, precision)
model, tokenizer, smodel = _load_model(
checkpoint_path, spk_emb_ckpt_path, device, precision, quantisation_mode=quantisation_mode
)

device_sync(device=device) # MKG
print(f"Time to load model: {time.time() - t0:.02f} seconds")
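The diff for `fam/llm/fast_quantize.py`, which defines `WeightOnlyInt4QuantHandler` and `WeightOnlyInt8QuantHandler`, is not shown here. For orientation only, the sketch below illustrates the arithmetic those names refer to: per-output-channel int8 weight quantisation and group-wise int4 quantisation (groups of 128 values, matching `groupsize=128` above). It is not the repository's implementation.

```python
import torch


def quantize_int8_per_channel(w: torch.Tensor):
    """Weight-only int8: one scale per output channel of a [out, in] weight."""
    scale = w.abs().amax(dim=1, keepdim=True) / 127.0                 # [out, 1]
    q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return q, scale


def quantize_int4_groupwise(w: torch.Tensor, groupsize: int = 128):
    """Weight-only int4: one scale per contiguous group of `groupsize` weights."""
    out_features, in_features = w.shape
    assert in_features % groupsize == 0
    wg = w.reshape(out_features, in_features // groupsize, groupsize)
    scale = wg.abs().amax(dim=-1, keepdim=True) / 7.0                 # int4 range: -8..7
    q = torch.clamp(torch.round(wg / scale), -8, 7).to(torch.int8)    # stored unpacked here
    return q, scale


def dequantize(q: torch.Tensor, scale: torch.Tensor, shape) -> torch.Tensor:
    """Recover an approximate floating-point weight for a normal matmul."""
    return (q.to(scale.dtype) * scale).reshape(shape)


if __name__ == "__main__":
    w = torch.randn(16, 256)
    q8, s8 = quantize_int8_per_channel(w)
    q4, s4 = quantize_int4_groupwise(w, groupsize=128)
    print("int8 max abs error:", (dequantize(q8, s8, w.shape) - w).abs().max().item())
    print("int4 max abs error:", (dequantize(q4, s4, w.shape) - w).abs().max().item())
```

The real handlers presumably go further, swapping `nn.Linear` modules for quantised replacements and, for int4, packing two 4-bit values per byte for use with CUDA kernels (which would explain `convert_for_runtime(use_cuda=True)`), but that code is not visible in this view.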
