
Commit

[Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (vllm-proje…
zeyugao authored Jul 28, 2024
1 parent b1366a9 commit 3eeb148
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -9,6 +9,7 @@
     UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):

     def __init__(self, quant_config: FBGEMMFp8Config):
         self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()

     def create_weights(
         self,
@@ -139,11 +141,12 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)

-        return apply_fp8_linear(input=x,
-                                weight=layer.weight,
-                                weight_scale=layer.weight_scale,
-                                input_scale=None,
-                                input_scale_ub=layer.input_scale_ub,
-                                bias=bias,
-                                cutlass_fp8_supported=True,
-                                use_per_token_if_dynamic=True)
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=True)
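The change replaces the hardcoded cutlass_fp8_supported=True with a flag probed once in __init__ and forwarded from apply. Below is a minimal, self-contained sketch of that pattern; the function and class names are stand-ins for illustration, not the actual vLLM APIs.

# Hypothetical sketch: probe hardware capability once at construction time
# and forward the cached flag, instead of hardcoding True at the call site.
# All names here are stand-ins, not the real vLLM implementation.

def cutlass_fp8_supported() -> bool:
    # Stand-in for the real capability probe (the real one inspects the GPU).
    return False

def apply_fp8_linear(x, weight, cutlass_fp8_supported: bool):
    # Stand-in dispatcher: use the CUTLASS kernel only when the cached
    # capability flag says it is available, otherwise fall back.
    backend = "cutlass" if cutlass_fp8_supported else "fallback"
    return f"fp8_linear({x}, {weight}) via {backend}"

class LinearMethodSketch:
    def __init__(self):
        # Probe once here so every apply() call reuses the same answer.
        self.cutlass_fp8_supported = cutlass_fp8_supported()

    def apply(self, x, weight):
        # Forward the cached flag rather than hardcoding True.
        return apply_fp8_linear(x, weight, self.cutlass_fp8_supported)

print(LinearMethodSketch().apply("x", "w"))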
