
Commit ad8807a

Model: Add support for num_experts_per_token
A new parameter that's safe to set as of exllamav2 v0.0.11. Only recommended for people who know what they're doing.

Signed-off-by: kingbri <bdashore3@proton.me>
1 parent: 70fbee3

3 files changed (+16, -1 lines)

3 files changed

+16
-1
lines changed

OAI/types/model.py

Lines changed: 3 additions & 1 deletion

@@ -7,8 +7,9 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = 4096
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
-    prompt_template: Optional[str] = None
     cache_mode: Optional[str] = "FP16"
+    prompt_template: Optional[str] = None
+    num_experts_per_token: Optional[int] = None
     draft: Optional['ModelCard'] = None

 class ModelCard(BaseModel):

@@ -40,6 +41,7 @@ class ModelLoadRequest(BaseModel):
     # low_mem: Optional[bool] = False
     cache_mode: Optional[str] = "FP16"
     prompt_template: Optional[str] = None
+    num_experts_per_token: Optional[int] = None
     draft: Optional[DraftModelLoadRequest] = None

 class ModelLoadResponse(BaseModel):
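Since ModelLoadRequest is the schema the model-load endpoint validates against, a client only needs to include the new field in its request body. A minimal client-side sketch follows; the endpoint URL, port, and the "name" field are assumptions for illustration, while the other fields come straight from the diff above.

```python
import requests

# Hypothetical load request that sets the new MoE override.
# Only cache_mode, prompt_template, and num_experts_per_token are confirmed
# by this commit; "name" and the endpoint path are assumed for the example.
payload = {
    "name": "mixtral-8x7b-exl2",       # assumed model folder name
    "cache_mode": "FP16",
    "prompt_template": None,
    "num_experts_per_token": 2,        # leave out to fall back to config.json
}

response = requests.post("http://localhost:5000/v1/model/load", json=payload)
print(response.status_code, response.text)
```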

config_sample.yml

Lines changed: 5 additions & 0 deletions

@@ -60,6 +60,11 @@ model:
   # NOTE: Only works with chat completion message lists!
   prompt_template:

+  # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)
+  # WARNING: Don't set this unless you know what you're doing!
+  # NOTE: For MoE models (ex. Mixtral) only!
+  num_experts_per_token:
+
   # Options for draft models (speculative decoding). This will use more VRAM!
   draft:
     # Overrides the directory to look for draft (default: models)
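When the option is left blank, the expert count comes from the model's own config.json, as the comment notes. A hedged sketch of where that default lives; the directory path and the "num_experts_per_tok" key follow Hugging Face Mixtral-style configs and are assumptions here, not something this commit defines.

```python
import json
from pathlib import Path

# Assumed model directory and key name (Mixtral-style HF config.json).
config_path = Path("models/mixtral-8x7b-exl2/config.json")
model_config = json.loads(config_path.read_text())
print(model_config.get("num_experts_per_tok"))  # e.g. 2 for Mixtral 8x7B
```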

model.py

Lines changed: 8 additions & 0 deletions

@@ -105,6 +105,14 @@ def progress(loaded_modules: int, total_modules: int, loading_draft: bool)
         # Set prompt template override if provided
         self.prompt_template = kwargs.get("prompt_template")

+        # Set num of experts per token if provided
+        num_experts_override = kwargs.get("num_experts_per_token")
+        if num_experts_override:
+            if hasattr(self.config, "num_experts_per_token"):
+                self.config.num_experts_per_token = num_experts_override
+            else:
+                print(" !! Warning: Currently installed ExLlamaV2 does not support overriding MoE experts")
+
         chunk_size = min(unwrap(kwargs.get("chunk_size"), 2048), self.config.max_seq_len)
         self.config.max_input_len = chunk_size
         self.config.max_attn_size = chunk_size ** 2
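The hasattr guard lets the override degrade gracefully on ExLlamaV2 builds older than v0.0.11 that don't expose num_experts_per_token on their config object. A self-contained sketch of that pattern; DummyConfig is a stand-in used only for this example and is not part of either codebase.

```python
class DummyConfig:
    """Stand-in for an ExLlamaV2 config object that supports MoE overrides."""
    num_experts_per_token = 2  # default an MoE model might load from config.json

def apply_experts_override(config, num_experts_override):
    # Mirrors the guard in the hunk above: apply the override only when the
    # attribute exists, otherwise warn and keep the model's own default.
    if num_experts_override:
        if hasattr(config, "num_experts_per_token"):
            config.num_experts_per_token = num_experts_override
        else:
            print(" !! Warning: Installed ExLlamaV2 does not support overriding MoE experts")

config = DummyConfig()
apply_experts_override(config, 3)
print(config.num_experts_per_token)  # -> 3
```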
