Update dependencies, support Python 3.12, update for exl2 0.1.5 (theroyallab#134)

* Dependencies: Add wheels for Python 3.12

* Model: Switch fp8 cache to Q8 cache

* Model: Add ability to set draft model cache mode

* Dependencies: Bump exllamav2 to 0.1.5

* Model: Support Q6 cache

* Config: Add Q6 cache and draft_cache_mode to config sample
DocShotgun authored Jun 9, 2024
1 parent dcd9428 commit 55d979b
Showing 5 changed files with 84 additions and 33 deletions.
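In practice, the new options surface as `cache_mode` and `draft_cache_mode` settings on the model container. A minimal sketch, assuming the container is constructed directly; the model paths are illustrative, and the nesting of draft options under a `draft` argument follows the pattern suggested by `draft_args` in the diff below:

```python
import pathlib

from backends.exllamav2.model import ExllamaV2Container

# Hypothetical paths and draft key names; only the cache_mode /
# draft_cache_mode semantics ("FP16"/"Q8"/"Q6"/"Q4", default "FP16")
# come from this commit.
container = ExllamaV2Container(
    pathlib.Path("/models/my-exl2-model"),
    cache_mode="Q8",  # replaces the old "FP8" mode
    draft={
        "draft_model_dir": "/models/my-draft-model",  # assumed key name
        "draft_cache_mode": "Q6",                     # new in this commit
    },
)
```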
50 changes: 41 additions & 9 deletions backends/exllamav2/model.py
@@ -11,8 +11,9 @@
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Cache_Q4,
ExLlamaV2Cache_Q6,
ExLlamaV2Cache_Q8,
ExLlamaV2Tokenizer,
ExLlamaV2Lora,
)
@@ -60,6 +61,7 @@ class ExllamaV2Container:
# Internal config vars
cache_size: int = None
cache_mode: str = "FP16"
draft_cache_mode: str = "FP16"
max_batch_size: int = 20
generation_config: Optional[GenerationConfig] = None

@@ -91,7 +93,7 @@ def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
def progress(loaded_modules: int, total_modules: int,
loading_draft: bool)
**kwargs:
`cache_mode` (str): Sets cache mode, "FP16" or "FP8"
`cache_mode` (str): Sets cache mode: "FP16"/"Q8"/"Q6"/"Q4"
(default: "FP16")
'max_seq_len' (int): Override model's default max sequence
length (default: 4096)
@@ -116,6 +118,8 @@ def progress(loaded_modules: int, total_modules: int,
model. By default, the draft model's alpha value is
calculated automatically to scale to the size of the
full model.
'draft_cache_mode' (str): Sets draft cache mode: "FP16"/"Q8"/"Q6"/"Q4"
(default: "FP16")
'lora_dir' (str): LoRA directory
'loras' (list[dict]): List of loras to be loaded, consisting of
'name' and 'scaling'
@@ -373,6 +377,7 @@ def progress(loaded_modules: int, total_modules: int,
self.calculate_rope_alpha(self.draft_config.max_seq_len),
)
self.draft_config.max_seq_len = self.config.max_seq_len
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

if chunk_size:
self.draft_config.max_input_len = chunk_size
@@ -460,6 +465,7 @@ def get_model_parameters(self):
"rope_scale": self.draft_config.scale_pos_emb,
"rope_alpha": self.draft_config.scale_alpha_value,
"max_seq_len": self.draft_config.max_seq_len,
"cache_mode": self.draft_cache_mode,
}

model_params["draft"] = draft_model_params
@@ -571,11 +577,30 @@ def progress(loaded_modules: int, total_modules: int)
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)

self.draft_cache = ExLlamaV2Cache(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
if self.draft_cache_mode == "Q4":
self.draft_cache = ExLlamaV2Cache_Q4(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
elif self.draft_cache_mode == "Q6":
self.draft_cache = ExLlamaV2Cache_Q6(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
elif self.draft_cache_mode == "Q8":
self.draft_cache = ExLlamaV2Cache_Q8(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
else:
self.draft_cache = ExLlamaV2Cache(
self.draft_model,
max_seq_len=self.cache_size,
lazy=True,
)
for value in self.draft_model.load_autosplit_gen(
self.draft_cache,
reserve_vram=autosplit_reserve,
@@ -612,8 +637,15 @@ def progress(loaded_modules: int, total_modules: int)
lazy=self.gpu_split_auto,
batch_size=1,
)
elif self.cache_mode == "FP8":
self.cache = ExLlamaV2Cache_8bit(
elif self.cache_mode == "Q6":
self.cache = ExLlamaV2Cache_Q6(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
batch_size=1,
)
elif self.cache_mode == "Q8":
self.cache = ExLlamaV2Cache_Q8(
self.model,
max_seq_len=self.cache_size,
lazy=self.gpu_split_auto,
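The per-mode branches added above all construct the cache the same way and differ only in the cache class. A compact equivalent, sketched here for clarity (not code from the commit; the class names and constructor signature are taken from the diff):

```python
from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)

# Mode -> cache class; anything unrecognized falls back to FP16.
CACHE_CLASSES = {
    "Q4": ExLlamaV2Cache_Q4,
    "Q6": ExLlamaV2Cache_Q6,
    "Q8": ExLlamaV2Cache_Q8,
}

def make_cache(model, cache_mode: str, cache_size: int, lazy: bool = True):
    """Build a KV cache for the requested mode, defaulting to FP16."""
    cache_cls = CACHE_CLASSES.get(cache_mode, ExLlamaV2Cache)
    return cache_cls(model, max_seq_len=cache_size, lazy=lazy)
```

A table like this would keep the draft and main cache paths symmetric as further quantized modes are added.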
2 changes: 1 addition & 1 deletion backends/exllamav2/utils.py
@@ -6,7 +6,7 @@
def check_exllama_version():
"""Verifies the exllama version"""

required_version = version.parse("0.1.4")
required_version = version.parse("0.1.5")
current_version = version.parse(package_version("exllamav2").split("+")[0])

if current_version < required_version:
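The bump to 0.1.5 gates startup on the new cache classes being available. Note the `split("+")[0]`: installed wheels carry a local version segment such as `0.1.5+cu121.torch2.3.1`, which is stripped before comparing against the plain release version. A small standalone illustration using `packaging` directly:

```python
from packaging import version

# Strip the local "+cu121..." tag before comparing release versions.
installed = version.parse("0.1.5+cu121.torch2.3.1".split("+")[0])
required = version.parse("0.1.5")
print(installed >= required)  # True
```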
6 changes: 5 additions & 1 deletion config_sample.yml
@@ -101,7 +101,7 @@ model:
#rope_alpha: 1.0

# Enable different cache modes for VRAM savings (slight performance hit).
# Possible values FP16, FP8, Q4. (default: FP16)
# Possible values FP16, Q8, Q6, Q4. (default: FP16)
#cache_mode: FP16

# Size of the prompt cache to allocate (default: max_seq_len)
@@ -152,6 +152,10 @@ model:
# Same thing as alpha_value
# Leave blank to automatically calculate alpha value
#draft_rope_alpha: 1.0

# Enable different draft model cache modes for VRAM savings (slight performance hit).
# Possible values FP16, Q8, Q6, Q4. (default: FP16)
#draft_cache_mode: FP16

# Options for loras
#lora:
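Taken together, a VRAM-constrained setup might set, for example, `cache_mode: Q8` for the main model and `draft_cache_mode: Q4` for the draft model; the specific values here are illustrative, not recommendations from the commit.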
1 change: 1 addition & 0 deletions endpoints/OAI/types/model.py
@@ -53,6 +53,7 @@ class DraftModelLoadRequest(BaseModel):
default=None,
examples=[1.0],
)
draft_cache_mode: Optional[str] = "FP16"


class ModelLoadRequest(BaseModel):
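On the API side, the draft load request type gains the same knob. A sketch of constructing such a request; field names other than `draft_cache_mode` are not visible in this diff and are assumed:

```python
from endpoints.OAI.types.model import DraftModelLoadRequest

request = DraftModelLoadRequest(
    draft_model_name="my-draft-model",  # assumed field name, illustrative value
    draft_cache_mode="Q6",              # new field; defaults to "FP16"
)
```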
58 changes: 36 additions & 22 deletions pyproject.toml
@@ -47,54 +47,68 @@ dev = [
]
cu121 = [
# Torch (Extra index URLs not supported in pyproject.toml)
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu121.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu121.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3.1cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",

# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
cu118 = [
# Torch
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+cu118.torch2.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+cu118.torch2.3.1-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
amd = [
# Torch triton for ROCm
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-2.2.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

# Torch
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.0%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.3.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.4/exllamav2-0.1.4+rocm6.0.torch2.3.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.1.5/exllamav2-0.1.5+rocm6.0.torch2.3.1-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
]

# MARK: Ruff options
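All of the pinned wheels above are selected by environment markers (platform, architecture, Python version), so an install such as `pip install .[cu121]` on Python 3.12 should now resolve the new cp312 builds; the cu118 and amd extras follow the same pattern.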
