updated for nonuniform

vllm-project · mgoin · Jul 20, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
commit 9aa66d3565daceafc9fad7407f5339c5a27a5ce2
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
@@ -34,6 +34,7 @@ def __init__(
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         blocksparse_params: Optional[Dict[str, Any]] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         if cache_config is not None:
@@ -56,7 +57,7 @@ def __init__(
         self._k_scale = 1.0
         self._v_scale = 1.0
         quant_method = quant_config.get_quant_method(
-            self) if quant_config else None
+            self, prefix=prefix) if quant_config else None
         if quant_method is not None:
             assert isinstance(quant_method, Fp8KVCacheMethod)
             # TODO (mgoin): kv cache dtype should be specified in the FP8

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
@@ -141,6 +141,7 @@ def __init__(
         skip_bias_add: bool = False,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
 
@@ -155,7 +156,7 @@ def __init__(
             self.quant_method: Optional[
                 QuantizeMethodBase] = UnquantizedLinearMethod()
         else:
-            self.quant_method = quant_config.get_quant_method(self)
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError
@@ -184,7 +185,7 @@ def __init__(self,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: Optional[str] = None):
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
-                         quant_config)
+                         quant_config, prefix=prefix)
 
         # All the linear layer supports quant method.
         assert self.quant_method is not None
@@ -260,7 +261,7 @@ def __init__(self,
                  output_sizes: Optional[List[int]] = None,
                  prefix: Optional[str] = None):
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
-                         quant_config)
+                         quant_config, prefix)
 
         self.gather_output = gather_output
 
@@ -709,7 +710,7 @@ def __init__(self,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: Optional[str] = None):
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
-                         quant_config)
+                         quant_config, prefix)
 
         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
@@ -98,11 +98,12 @@ def get_from_keys_or(config: Dict[str, Any], keys: List[str],
 
     @abstractmethod
     def get_quant_method(
-            self, layer: torch.nn.Module) -> Optional[QuantizeMethodBase]:
+            self, layer: torch.nn.Module, prefix: str) -> Optional[QuantizeMethodBase]:
         """Get the quantize method to use for the quantized layer.
 
         Args:
             layer: The layer for the quant method.
+            prefix: The full name of the layer in the state dict
         Returns:
             The quantize method. None if the given layer doesn't support quant
             method.

diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -5,7 +5,7 @@
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@@ -15,16 +15,28 @@
 logger = init_logger(__name__)
 
 
+# Note: this is a hack. We should update each model to register the 
+# stacked params and get it from there instead in a future PR.
+# fused_name: List[shard_name]
+_FUSED_LAYER_NAME_MAPPING = {
+    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    "gate_up_proj": ["gate_proj", "up_proj"]
+}
+
+
 class FBGEMMFp8Config(QuantizationConfig):
     """Config class for FBGEMM Fp8."""
 
+    def __init__(self, ignore_list: List[str]):
+        self.ignore_list = ignore_list
+
     @classmethod
     def get_name(cls) -> str:
         return "fbgemm_fp8"
 
     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
-        return [torch.bfloat16]
+        return [torch.bfloat16, torch.float16]
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -36,11 +48,42 @@ def get_config_filenames(cls) -> List[str]:
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "FBGEMMFp8Config":
-        return cls()
-
+        ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"])
+        return cls(ignore_list=ignore_list)
+
+    def _is_layer_skipped(self, prefix: str) -> bool:
+        # prefix: model.layers.0.self_attn.q_proj
+        # proj_name: q_proj
+        proj_name = prefix.split(".")[-1]
+        if proj_name in _FUSED_LAYER_NAME_MAPPING:
+            shard_prefixes = [
+                prefix.replace(proj_name, shard_proj_name) for
+                shard_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]
+            ]
+
+            is_skipped = None
+            for shard_prefix in shard_prefixes:
+                is_shard_skipped = shard_prefix in self.ignore_list
+
+                if is_skipped is None:
+                    is_skipped = is_shard_skipped
+                elif is_shard_skipped != is_skipped:
+                    raise ValueError(
+                        f"Detected some but not all shards of {prefix} "
+                        "are quantized. All shards of fused layers "
+                        "to have the same precision."
+                    )
+        else:
+            is_skipped = prefix in self.ignore_list
+
+        assert is_skipped is not None
+        return is_skipped
+
     def get_quant_method(
-            self, layer: torch.nn.Module) -> Optional["QuantizeMethodBase"]:
+            self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
         if isinstance(layer, LinearBase):
+            if self._is_layer_skipped(prefix):
+                return UnquantizedLinearMethod()
             return FBGEMMFp8LinearMethod(self)
         return None
 

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -161,6 +161,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         org_num_embeddings: original vocabulary size (without LoRA).
         padding_size: padding size for the vocabulary.
         quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
     """  # noqa: E501
 
     def __init__(self,
@@ -169,7 +170,8 @@ def __init__(self,
                  params_dtype: Optional[torch.dtype] = None,
                  org_num_embeddings: Optional[int] = None,
                  padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__()
 
         # Keep the input dimensions.
@@ -195,7 +197,7 @@ def __init__(self,
 
         linear_method = None
         if quant_config is not None:
-            linear_method = quant_config.get_quant_method(self)
+            linear_method = quant_config.get_quant_method(self, prefix=prefix)
         if linear_method is None:
             linear_method = UnquantizedLinearMethod()
         self.linear_method: QuantizeMethodBase = linear_method
@@ -382,9 +384,10 @@ def __init__(self,
                  params_dtype: Optional[torch.dtype] = None,
                  org_num_embeddings: Optional[int] = None,
                  padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-                 quant_config: Optional[QuantizationConfig] = None):
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
         super().__init__(num_embeddings, embedding_dim, params_dtype,
-                         org_num_embeddings, padding_size, quant_config)
+                         org_num_embeddings, padding_size, quant_config, prefix)
         if bias:
             self.bias = Parameter(
                 torch.empty(self.num_embeddings_per_partition,