Commit daa8a03

DarkLight1337 and amd-xiaoyu12 authored and committed
[V1] Enable V1 for compute capability < 8.0 + FP32 (vllm-project#23614)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Xiao Yu <xiao.yu@amd.com>
1 parent 1c68e1c commit daa8a03

File tree

1 file changed (+8 −8 lines changed)


vllm/engine/arg_utils.py

Lines changed: 8 additions & 8 deletions
@@ -1433,15 +1433,15 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=True)
             return False
 
-        # Need at least Ampere for now (FA support required).
-        # Skip this check if we are running on a non-GPU platform,
-        # or if the device capability is not available
-        # (e.g. in a Ray actor without GPUs).
+        # Triton v3.3 has f16 conversion regression issue on Turing and Volta,
+        # which broke fp16 inference
+        # see: https://github.com/triton-lang/triton/issues/6698
         if (current_platform.is_cuda()
-                and current_platform.get_device_capability()
-                and current_platform.get_device_capability().major < 8):
-            _raise_or_fallback(feature_name="Compute Capability < 8.0",
-                               recommend_to_remove=False)
+                and not current_platform.has_device_capability(80)
+                and model_config.dtype == torch.float16):
+            _raise_or_fallback(
+                feature_name="Compute Capability < 8.0 with FP16",
+                recommend_to_remove=False)
             return False
 
         if self.kv_cache_dtype != "auto":
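
For context, a minimal standalone sketch of what the new guard checks. The helper functions below are illustrative stand-ins for vLLM's current_platform interface (only their names come from the diff), and the major * 10 + minor capability encoding is an assumption inferred from the has_device_capability(80) call:

    import torch

    # Illustrative stand-in for current_platform.is_cuda().
    def is_cuda() -> bool:
        return torch.cuda.is_available()

    # Illustrative stand-in for current_platform.has_device_capability(80).
    def has_device_capability(capability: int) -> bool:
        # Assumed encoding: major * 10 + minor, so 80 means compute capability 8.0.
        major, minor = torch.cuda.get_device_capability()
        return major * 10 + minor >= capability

    def v1_rejects(dtype: torch.dtype) -> bool:
        # Before this commit: every pre-Ampere (< 8.0) GPU was rejected.
        # After: only pre-Ampere GPUs running FP16 are rejected, since the
        # Triton v3.3 f16 regression is what actually breaks; FP32 passes.
        return (is_cuda()
                and not has_device_capability(80)
                and dtype == torch.float16)

Under these assumptions, on a Turing card (compute capability 7.5) v1_rejects(torch.float32) is False, so the V1 oracle no longer falls back for FP32 models there (e.g. something like vllm serve <model> --dtype float32), while FP16 on the same card still triggers _raise_or_fallback.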
