Skip to content

Commit

Permalink
[Bugfix] Only add Attention.kv_scale if kv cache quantization is en…
Browse files Browse the repository at this point in the history
…abled (vllm-project#5936)

Signed-off-by: Alvant <alvasian@yandex.ru>
  • Loading branch information
mgoin authored and Alvant committed Oct 26, 2024
1 parent 76029b0 commit df739a9
Showing 1 changed file with 14 additions and 9 deletions.
23 changes: 14 additions & 9 deletions vllm/attention/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vllm.config import CacheConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod


class Attention(nn.Module):
Expand Down Expand Up @@ -56,15 +57,19 @@ def __init__(
quant_method = quant_config.get_quant_method(
self) if quant_config else None
if quant_method is not None:
if self.kv_cache_dtype == "fp8_e5m2":
raise ValueError("fp8_e5m2 kv-cache is not supported with "
"fp8 checkpoints.")
# When FP8 quantization is enabled, we make a parameter
# "kv_scale" so that it can be loaded from FP8 checkpoint.
# The kv_scale will then be converted back
# to self._kv_scale in a native float32 value after weight loading.
self.quant_method = quant_method
self.quant_method.create_weights(self)
assert isinstance(quant_method, Fp8KVCacheMethod)
# TODO (mgoin): kv cache dtype should be specified in the FP8
# checkpoint config and become the "auto" behavior
if "fp8" in self.kv_cache_dtype:
if self.kv_cache_dtype == "fp8_e5m2":
raise ValueError("fp8_e5m2 kv-cache is not supported with "
"fp8 checkpoints.")
# When FP8 quantization is enabled, we make a parameter
# "kv_scale" so that it can be loaded from FP8 checkpoint.
# The kv_scale will then be converted back to self._kv_scale
# in a native float32 value after weight loading.
self.quant_method = quant_method
self.quant_method.create_weights(self)

# During model initialization, the default dtype is set as the model
# weight and activation dtype.
Expand Down

0 comments on commit df739a9

Please sign in to comment.