
Commit 408c66a

Model: Change FA2 and paged attention checks
The dynamic generator requires Flash Attention 2.5.7 or higher to be installed. This is only supported on Nvidia's 30 series (Ampere) and higher. If a card is AMD or older than the 30 series, switch to compatibility mode, which functions the same way as the older generator but without parallel batching and any features that depend on it, such as CFG.

Signed-off-by: kingbri <bdashore3@proton.me>
1 parent c2d3675
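
For context, here is a minimal standalone sketch of the capability check this commit adds. It is an illustration under assumptions, not the shipped code: the real check in load_gen only inspects the devices the loaded model occupies, whereas this sketch looks at every visible GPU. The threshold of 8 is the compute capability major version of Ampere (the 30 series).

import torch


def supports_paged_attention() -> bool:
    """Paged FA2 needs an Nvidia GPU with compute capability 8.x or newer."""
    if torch.version.hip:
        # ROCm (AMD) builds always fall back to compatibility mode
        return False
    if not torch.cuda.is_available():
        return False
    # Take the weakest visible GPU; the commit checks only the model's devices
    min_capability = min(
        torch.cuda.get_device_capability(device=idx)[0]
        for idx in range(torch.cuda.device_count())
    )
    return min_capability >= 8


if __name__ == "__main__":
    print("Paged attention supported:", supports_paged_attention())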

File tree

3 files changed: +31 -35 lines changed

  backends/exllamav2/model.py
  config_sample.yml
  endpoints/OAI/types/model.py

backends/exllamav2/model.py

Lines changed: 25 additions & 26 deletions
@@ -3,8 +3,6 @@
 import gc
 import math
 import pathlib
-import threading
-import time
 import traceback
 import torch
 import uuid
@@ -57,10 +55,11 @@ class ExllamaV2Container:
     generator: Optional[ExLlamaV2DynamicGeneratorAsync] = None
     prompt_template: Optional[PromptTemplate] = None
     active_loras: List[ExLlamaV2Lora] = []
+    paged: bool = True
 
     # Internal config vars
     cache_mode: str = "FP16"
-    use_cfg: bool = False
+    max_batch_size: int = 20
     generation_config: Optional[GenerationConfig] = None
 
     # GPU split vars
@@ -115,10 +114,6 @@ def progress(loaded_modules: int, total_modules: int,
                 available devices (default: True)
             'gpu_split' (list[float]): Allocation for weights and (some)
                 tensors, per device
-            'no_flash_attn' (bool): Turns off flash attention
-                (increases vram usage) (default: False)
-            'use_cfg" (bool): Enables CFG support. Disables flash attention
-                (default: False)
         """
 
         self.quiet = quiet
@@ -184,18 +179,9 @@ def progress(loaded_modules: int, total_modules: int,
             kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
         )
 
-        # Enable CFG if present
-        self.use_cfg = unwrap(kwargs.get("use_cfg"), False)
-
         # Enable fasttensors loading if present
        self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
 
-        # Turn off flash attention if CFG is on
-        # Workaround until batched FA2 is fixed in exllamav2 upstream
-        # self.config.no_flash_attn = (
-        #     True if self.use_cfg else unwrap(kwargs.get("no_flash_attention"), False)
-        # )
-
         # Try to set prompt template
         self.prompt_template = self.find_prompt_template(
             kwargs.get("prompt_template"), model_directory
@@ -345,7 +331,6 @@ def get_model_parameters(self):
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
             "num_experts_per_token": self.config.num_experts_per_token,
-            "use_cfg": self.use_cfg,
             "prompt_template": self.prompt_template.name
             if self.prompt_template
             else None,
@@ -420,10 +405,24 @@ async def load_gen(self, progress_callback=None):
         async for value in iterate_in_threadpool(model_load_generator):
             yield value
 
-        # TODO: Change these!
-        # Set the max batch size and check if paged support is available
-        max_batch_size = 1 if self.config.no_flash_attn else 20
-        paged = not self.config.no_flash_attn
+        # Disable paged mode if the user's min GPU isn't supported (ampere and above required)
+        min_compute_capability = min(
+            set(
+                [
+                    torch.cuda.get_device_capability(device=module.device_idx)[0]
+                    for module in self.model.modules
+                    if module.device_idx >= 0
+                ]
+            )
+        )
+
+        if torch.version.hip or min_compute_capability < 8:
+            logger.warning(
+                "An unsupported GPU is found in this configuration. "
+                "Switching to compatibility mode. This disables parallel batching."
+            )
+            self.paged = False
+            self.max_batch_size = 1
 
         # Create async generator
         self.generator = ExLlamaV2DynamicGeneratorAsync(
@@ -432,8 +431,8 @@ async def load_gen(self, progress_callback=None):
             draft_model=self.draft_model,
             draft_cache=self.draft_cache,
             tokenizer=self.tokenizer,
-            max_batch_size=max_batch_size,
-            paged=paged,
+            max_batch_size=self.max_batch_size,
+            paged=self.paged,
         )
 
         # Clean up any extra vram usage from torch and cuda
@@ -741,7 +740,7 @@ async def generate_gen(self, prompt: str, **kwargs):
         cfg_scale = unwrap(kwargs.get("cfg_scale"), 1.0)
         negative_prompt = None
         if cfg_scale not in [None, 1.0]:
-            if self.use_cfg:
+            if self.paged:
                 gen_settings.cfg_scale = cfg_scale
 
                 # If the negative prompt is empty, use the BOS token
@@ -752,8 +751,8 @@ async def generate_gen(self, prompt: str, **kwargs):
                     prompts.append(negative_prompt)
             else:
                 logger.warning(
-                    "CFG is currently disabled. "
-                    "If your GPU is supported, reload your model with use_cfg = True"
+                    "CFG is currently disabled because paged mode is disabled. "
+                    "Please use an ampere (30 series) or higher GPU for CFG support."
                 )
 
         gen_settings.token_repetition_penalty = unwrap(
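
The last two hunks above move the CFG gate from the old use_cfg flag onto the new paged flag. The condensed sketch below mirrors that behaviour in isolation; gen_settings here is a plain stand-in object, not the real ExLlamaV2 sampler settings.

import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def apply_cfg(gen_settings, cfg_scale, paged: bool) -> None:
    """Honour a non-default cfg_scale only when the paged generator is active."""
    if cfg_scale in (None, 1.0):
        return
    if paged:
        gen_settings.cfg_scale = cfg_scale
    else:
        logger.warning(
            "CFG is currently disabled because paged mode is disabled. "
            "Please use an ampere (30 series) or higher GPU for CFG support."
        )


settings = SimpleNamespace(cfg_scale=1.0)
apply_cfg(settings, 1.5, paged=True)   # cfg_scale is applied
apply_cfg(settings, 1.5, paged=False)  # warning is logged, the scale is ignored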

config_sample.yml

Lines changed: 6 additions & 7 deletions
@@ -100,9 +100,6 @@ model:
   # Leave blank to automatically calculate alpha
   #rope_alpha: 1.0
 
-  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
-  #no_flash_attention: False
-
   # Enable different cache modes for VRAM savings (slight performance hit).
   # Possible values FP16, FP8, Q4. (default: FP16)
   #cache_mode: FP16
@@ -111,6 +108,12 @@ model:
   # NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
   #chunk_size: 2048
 
+  # Set the maximum amount of prompts to process at one time (batch)
+  # This will be automatically adjusted depending on the cache size.
+  # A max batch size of 1 processes prompts one at a time.
+  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs
+  #max_batch_size: 20
+
   # Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
   # If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
   # of the template you want to use.
@@ -122,10 +125,6 @@ model:
   # NOTE: For MoE models (ex. Mixtral) only!
   #num_experts_per_token:
 
-  # Enables CFG support (default: False)
-  # WARNING: This flag disables Flash Attention! (a stopgap fix until it's fixed in upstream)
-  #use_cfg: False
-
   # Enables fasttensors to possibly increase model loading speeds (default: False)
   #fasttensors: true
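
The new max_batch_size option interacts with the compatibility-mode fallback shown in model.py: when paged attention is unavailable, the batch size collapses to 1. The sketch below is a hypothetical illustration of that relationship; resolve_batch_size is a made-up name, unwrap mirrors the project's helper of the same name, and the automatic cache-size adjustment mentioned in the comment is omitted.

def unwrap(value, default):
    """Mirror of the project's unwrap() helper: fall back when value is None."""
    return default if value is None else value


def resolve_batch_size(config: dict, paged: bool) -> int:
    """Use the configured max_batch_size only when paged attention is available."""
    if not paged:
        # Compatibility mode (AMD or pre-Ampere): prompts run one at a time
        return 1
    return unwrap(config.get("max_batch_size"), 20)


print(resolve_batch_size({"max_batch_size": 8}, paged=True))   # -> 8
print(resolve_batch_size({}, paged=False))                     # -> 1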

endpoints/OAI/types/model.py

Lines changed: 0 additions & 2 deletions
@@ -19,7 +19,6 @@ class ModelCardParameters(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    use_cfg: Optional[bool] = None
 
     # Draft is another model, so include it in the card params
     draft: Optional["ModelCard"] = None
@@ -94,7 +93,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    use_cfg: Optional[bool] = None
     fasttensors: Optional[bool] = False
     draft: Optional[DraftModelLoadRequest] = None
     skip_queue: Optional[bool] = False

0 commit comments
