1 parent 81d86bb commit c7e1e07
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -112,15 +112,16 @@ def apply_fp8_linear(
     # If dynamic, layer.input_scale is None and x_scale computed from x.
     # If static, layer.input_scale is scalar and x_scale is input_scale.
 
-    if bias is None and cutlass_fp8_supported:
+    if cutlass_fp8_supported:
         qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
 
         # Fused GEMM_DQ
         output = ops.cutlass_scaled_mm(qinput,
                                        weight,
                                        out_dtype=input.dtype,
                                        scale_a=x_scale,
-                                       scale_b=weight_scale)
+                                       scale_b=weight_scale,
+                                       bias=bias)
 
     else:
         qinput, x_scale = ops.scaled_fp8_quant(input,
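
The effect of this change is that the CUTLASS fp8 path no longer falls back to the other branch whenever a bias is present: the bias is passed straight into ops.cutlass_scaled_mm and added as part of the fused GEMM_DQ. Below is a minimal PyTorch sketch of the assumed semantics of that fused call (a reference only, not vLLM's kernel; the operand shapes and per-tensor scales are assumptions for illustration).

import torch
from typing import Optional

def scaled_mm_reference(qinput: torch.Tensor,     # fp8 activations, assumed shape (M, K)
                        weight: torch.Tensor,     # fp8 weights, assumed shape (K, N)
                        scale_a: torch.Tensor,    # per-tensor activation scale (assumption)
                        scale_b: torch.Tensor,    # per-tensor weight scale (assumption)
                        out_dtype: torch.dtype,
                        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Dequantize both operands, matmul, then add the bias in the same step,
    # mirroring what the fused GEMM_DQ call with bias= is expected to produce.
    out = (qinput.to(torch.float32) * scale_a) @ (weight.to(torch.float32) * scale_b)
    if bias is not None:
        out = out + bias.to(torch.float32)
    return out.to(out_dtype)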