Commit 6b2ef5c

[Bug] Fix Attention when ignored by quant_method (#14313)
Signed-off-by: mgoin <mgoin64@gmail.com>
Parent: 958adce


vllm/attention/layer.py

Lines changed: 3 additions & 1 deletion
@@ -11,6 +11,7 @@
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -97,7 +98,8 @@ def __init__(

         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
-        if quant_method is not None:
+        if quant_method is not None and not isinstance(
+                quant_method, UnquantizedLinearMethod):
             assert isinstance(quant_method, BaseKVCacheMethod)
             # TODO (mgoin): kv cache dtype should be specified in the FP8
             # checkpoint config and become the "auto" behavior
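
Why the extra check is needed: when a quantization config lists the attention layer in its ignore list, get_quant_method can return an UnquantizedLinearMethod instead of a BaseKVCacheMethod, and the previously unconditional assert then crashed. The sketch below isolates the fixed guard; init_attention_quant and the stub class bodies are simplified stand-ins for illustration, not vLLM's actual implementation.

# Minimal sketch of the guard added by this commit. Class names mirror
# vLLM's, but the bodies are stand-ins; init_attention_quant is hypothetical.

class UnquantizedLinearMethod:
    """Stand-in: what a quant config returns for layers in its ignore list."""


class BaseKVCacheMethod:
    """Stand-in base class for KV-cache quantization methods."""

    def create_weights(self, layer):
        pass


def init_attention_quant(layer, quant_method):
    # Mirrors the fixed condition: skip KV-cache quantization both when
    # there is no quant_method and when the layer was ignored by the
    # quant config (which hands back an UnquantizedLinearMethod).
    if quant_method is not None and not isinstance(
            quant_method, UnquantizedLinearMethod):
        assert isinstance(quant_method, BaseKVCacheMethod)
        quant_method.create_weights(layer)
        return "kv-cache quantized"
    return "default (unquantized) attention path"


if __name__ == "__main__":
    layer = object()
    print(init_attention_quant(layer, None))
    print(init_attention_quant(layer, UnquantizedLinearMethod()))  # crashed before the fix
    print(init_attention_quant(layer, BaseKVCacheMethod()))

With this guard, an ignored Attention layer simply falls through to the default, unquantized KV-cache path instead of tripping the assert.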
