Commit c7e1e07

tlrmchlsmth authored and LeiWang1999 committed
[Kernel] Use CUTLASS kernels for the FP8 layers with Bias (vllm-project#6270)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent 81d86bb commit c7e1e07

File tree

1 file changed: +3 -2 lines changed

vllm/model_executor/layers/quantization/utils/w8a8_utils.py

Lines changed: 3 additions & 2 deletions
@@ -112,15 +112,16 @@ def apply_fp8_linear(
     # If dynamic, layer.input_scale is None and x_scale computed from x.
     # If static, layer.input_scale is scalar and x_scale is input_scale.
 
-    if bias is None and cutlass_fp8_supported:
+    if cutlass_fp8_supported:
         qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
 
         # Fused GEMM_DQ
         output = ops.cutlass_scaled_mm(qinput,
                                        weight,
                                        out_dtype=input.dtype,
                                        scale_a=x_scale,
-                                       scale_b=weight_scale)
+                                       scale_b=weight_scale,
+                                       bias=bias)
 
     else:
         qinput, x_scale = ops.scaled_fp8_quant(input,
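
After this change, a non-None bias no longer forces apply_fp8_linear onto the fallback branch: the bias is passed directly to ops.cutlass_scaled_mm, so the CUTLASS kernel applies it in the same launch as the scaled GEMM. As a rough illustration of the arithmetic that call is expected to produce, here is a minimal pure-PyTorch sketch. The function name, the float tensors standing in for FP8 data, and the per-tensor scales are assumptions made for this example, not the vLLM kernel or its memory layout.

import torch

def scaled_mm_bias_reference(qinput, weight, scale_a, scale_b, bias, out_dtype):
    # Dequantize both operands with their per-tensor scales, multiply, and add
    # the bias in one expression -- the same result a fused scaled-matmul-plus-bias
    # kernel would return, computed here in plain PyTorch for illustration only.
    acc = (qinput.float() * scale_a) @ (weight.float() * scale_b)
    return (acc + bias.float()).to(out_dtype)

# Example usage with float tensors standing in for FP8-quantized data.
M, K, N = 4, 8, 16
qinput = torch.randn(M, K)
weight = torch.randn(K, N)
bias = torch.randn(N, dtype=torch.float16)
out = scaled_mm_bias_reference(qinput, weight,
                               torch.tensor(0.05), torch.tensor(0.02),
                               bias, torch.float16)
print(out.shape)  # torch.Size([4, 16])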
