Update dependencies, support Python 3.12, update for exl2 0.1.5 (theroyallab#134)

* Dependencies: Add wheels for Python 3.12

* Model: Switch fp8 cache to Q8 cache

* Model: Add ability to set draft model cache mode

* Dependencies: Bump exllamav2 to 0.1.5

* Model: Support Q6 cache

* Config: Add Q6 cache and draft_cache_mode to config sample
DocShotgun authored Jun 9, 2024
1 parent dcd9428 commit 55d979b
Showing 5 changed files with 84 additions and 33 deletions.
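In practice, the new options surface as `cache_mode` and `draft_cache_mode` settings on the model container. A minimal sketch, assuming the container is constructed directly; the model paths are illustrative, and the nesting of draft options under a `draft` argument follows the pattern suggested by `draft_args` in the diff below:

```python
import pathlib

from backends.exllamav2.model import ExllamaV2Container

# Hypothetical paths and draft key names; only the cache_mode /
# draft_cache_mode semantics ("FP16"/"Q8"/"Q6"/"Q4", default "FP16")
# come from this commit.
container = ExllamaV2Container(
    pathlib.Path("/models/my-exl2-model"),
    cache_mode="Q8",  # replaces the old "FP8" mode
    draft={
        "draft_model_dir": "/models/my-draft-model",  # assumed key name
        "draft_cache_mode": "Q6",                     # new in this commit
    },
)
```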
50 changes: 41 additions & 9 deletions backends/exllamav2/model.py
@@ -11,8 +11,9 @@
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Cache_Q4,
ExLlamaV2Cache_Q6,
ExLlamaV2Cache_Q8,
ExLlamaV2Tokenizer,
ExLlamaV2Lora,
)
@@ -60,6 +61,7 @@ class ExllamaV2Container:
# Internal config vars
cache_size: int = None
cache_mode: str = "FP16"
draft_cache_mode: str = "FP16"
max_batch_size: int = 20
generation_config: Optional[GenerationConfig] = None

@@ -91,7 +93,7 @@ def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
def progress(loaded_modules: int, total_modules: int,
loading_draft: bool)
**kwargs:
`cache_mode` (str): Sets cache mode, "FP16" or "FP8"
`cache_mode` (str): Sets cache mode: "FP16"/"Q8"/"Q6"/"Q4"
(default: "FP16")
'max_seq_len' (int): Override model's default max sequence
length (default: 4096)
@@ -116,6 +118,8 @@ def progress(loaded_modules: int, total_modules: int,
model. By default, the draft model's alpha value is
calculated automatically to scale to the size of the
full model.
'draft_cache_mode' (str): Sets draft cache mode: "FP16"/"Q8"/"Q6"/"Q4"
(default: "FP16")
'lora_dir' (str): LoRA directory
'loras' (list[dict]): List of loras to be loaded, consisting of
'name' and 'scaling'
@@ -373,6 +377,7 @@ def progress(loaded_modules: int, total_modules: int,
self.calculate_rope_alpha(self.draft_config.max_seq_len),
)
self.draft_config.max_seq_len = self.config.max_seq_len
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

if chunk_size:
self.draft_config.max_input_len = chunk_size
@@ -460,6 +465,7 @@ def get_model_parameters(self):
"rope_scale": self.draft_config.scale_pos_emb,
"rope_alpha": self.draft_config.scale_alpha_value,
"max_seq_len": self.draft_config.max_seq_len,
"cache_mode": self.draft_cache_mode,
}

model_params["draft"] = draft_model_params
@@ -571,11 +577,30 @@ def progress(loaded_modules: int, total_modules: int)
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)

self.draft_cache = ExLlamaV2Cache(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
if self.draft_cache_mode == "Q4":
self.draft_cache = ExLlamaV2Cache_Q4(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
elif self.draft_cache_mode == "Q6":
self.draft_cache = ExLlamaV2Cache_Q6(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
elif self.draft_cache_mode == "Q8":
self.draft_cache = ExLlamaV2Cache_Q8(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
else:
self.draft_cache = ExLlamaV2Cache(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
for value in self.draft_model.load_autosplit_gen(
self.draft_cache,
reserve_vram=autosplit_reserve,
@@ -612,8 +637,15 @@ def progress(loaded_modules: int, total_modules: int)
lazy=self.gpu_split_auto,
batch_size=1,
)
elif self.cache_mode == "FP8":
self.cache = ExLlamaV2Cache_8bit(
elif self.cache_mode == "Q6":
self.cache = ExLlamaV2Cache_Q6(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
elif self.cache_mode == "Q8":
self.cache = ExLlamaV2Cache_Q8(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
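The per-mode branches added above all construct the cache the same way and differ only in the cache class. A compact equivalent, sketched here for clarity (not code from the commit; the class names and constructor signature are taken from the diff):

```python
from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)

# Mode -> cache class; anything unrecognized falls back to FP16.
CACHE_CLASSES = {
    "Q4": ExLlamaV2Cache_Q4,
    "Q6": ExLlamaV2Cache_Q6,
    "Q8": ExLlamaV2Cache_Q8,
}

def make_cache(model, cache_mode: str, cache_size: int, lazy: bool = True):
    """Build a KV cache for the requested mode, defaulting to FP16."""
    cache_cls = CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)
    return cache_cls(model, max_seq_len=cache_size, lazy=lazy)
```

A table like this would keep the draft and main cache paths symmetric as further quantized modes are added.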
2 changes: 1 addition & 1 deletion backends/exllamav2/utils.py
@@ -6,7 +6,7 @@
def check_exllama_version():
"""Verifies the exllama version"""

required_version = version.parse("0.1.4")
required_version = version.parse("0.1.5")
current_version = version.parse(package_version("exllamav2").split("+")[0])

if current_version < required_version:
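The bump to 0.1.5 gates startup on the new cache classes being available. Note the `split("+")[0]`: installed wheels carry a local version segment such as `0.1.5+cu121.torch2.3.1`, which is stripped before comparing against the plain release version. A small standalone illustration using `packaging` directly:

```python
from packaging import version

# Strip the local "+cu121..." tag before comparing release versions.
installed = version.parse("0.1.5+cu121.torch2.3.1".split("+")[0])
required = version.parse("0.1.5")
print(installed >= required)  # True
```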
6 changes: 5 additions & 1 deletion config_sample.yml
@@ -101,7 +101,7 @@ model:
#rope_alpha: 1.0

# Enable different cache modes for VRAM savings (slight performance hit).
# Possible values FP16, FP8, Q4. (default: FP16)
# Possible values FP16, Q8, Q6, Q4. (default: FP16)
#cache_mode: FP16

# Size of the prompt cache to allocate (default: max_seq_len)
@@ -152,6 +152,10 @@ model:
# Same thing as alpha_value
# Leave blank to automatically calculate alpha value
#draft_rope_alpha: 1.0

# Enable different draft model cache modes for VRAM savings (slight performance hit).
# Possible values FP16, Q8, Q6, Q4. (default: FP16)
#draft_cache_mode: FP16

# Options for loras
#lora:
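Taken together, a VRAM-constrained setup might set, for example, `cache_mode: Q8` for the main model and `draft_cache_mode: Q4` for the draft model; the specific values here are illustrative, not recommendations from the commit.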
1 change: 1 addition & 0 deletions endpoints/OAI/types/model.py
@@ -53,6 +53,7 @@ class DraftModelLoadRequest(BaseModel):
default=None,
examples=[1.0],
)
draft_cache_mode: Optional[str] = "FP16"


class ModelLoadRequest(BaseModel):
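On the API side, the draft load request type gains the same knob. A sketch of constructing such a request; field names other than `draft_cache_mode` are not visible in this diff and are assumed:

```python
from endpoints.OAI.types.model import DraftModelLoadRequest

request = DraftModelLoadRequest(
    draft_model_name="my-draft-model",  # assumed field name, illustrative value
    draft_cache_mode="Q6",              # new field; defaults to "FP16"
)
```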
58 changes: 36 additions & 22 deletions pyproject.toml
@@ -47,54 +47,68 @@ dev = [
]
cu121 = [
# Torch (Extra index URLs not supported in pyproject.toml)
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",

# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
cu118 = [
# Torch
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
amd = [
# Torch triton for ROCm
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

# Torch
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
]

# MARK: Ruff options
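All of the pinned wheels above are selected by environment markers (platform, architecture, Python version), so an install such as `pip install .[cu121]` on Python 3.12 should now resolve the new cp312 builds; the cu118 and amd extras follow the same pattern.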
