vllm-project · mgoin · Feb 27, 2025 · Feb 27, 2025 · Feb 28, 2025 · Feb 28, 2025
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -96,6 +96,7 @@
     VLLM_DP_SIZE: int = 1
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
+    VLLM_USE_DEEPGEMM: bool = False
 
 
 def get_default_cache_root():
@@ -630,6 +631,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # Whether to use S3 path for model loading in CI via RunAI Streamer
     "VLLM_CI_USE_S3":
     lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
+
+    # If set to 1, use DeepGEMM kernels for block-quantized FP8 linear layers
+    "VLLM_USE_DEEPGEMM":
+    lambda: os.environ.get("VLLM_USE_DEEPGEMM", "0") == "1",
 }
 
 # end-env-vars-definition

@@ -11,6 +11,7 @@
 import triton.language as tl
 
 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     _normalize_quant_group_shape, scaled_dequantize)
@@ -58,7 +59,16 @@ def apply_w8a8_block_fp8_linear(
         if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0])
                 or br not in (1, weight.shape[0])):
             shape_supported_by_cutlass = False
-    if cutlass_block_fp8_supported and shape_supported_by_cutlass:
+
+    if envs.VLLM_USE_DEEPGEMM and block_size == [128, 128]:
+        import deep_gemm
+        q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1])
+        output = torch.empty((input_2d.shape[0], weight.shape[0]),
+                             device=input.device,
+                             dtype=input.dtype)
+        deep_gemm.gemm_fp8_fp8_bf16_nt((q_input, x_scale),
+                                       (weight, weight_scale), output)
+    elif cutlass_block_fp8_supported and shape_supported_by_cutlass:
         q_input, x_scale = per_token_group_quant_fp8(input_2d,
                                                      block_size[1],
                                                      column_major_scales=True)