[Kernel] Initial Activation Quantization Support #4525

Merged May 23, 2024 (49 commits)
Changes from 1 commit
Commits
4d27a2c  Initial `CompressedTensors` config + Activation Quantization support … (dsikka, Apr 30, 2024)
92b3703  add get_quant method to compressed tensors config (dsikka, Apr 30, 2024)
2a3eb83  small rebase fixed (dsikka, Apr 30, 2024)
3dd1fe8  format (dsikka, Apr 30, 2024)
f2f8c52  fix mypy complaints (Apr 30, 2024)
c9308eb  Merge branch 'main' into ds-quant (dsikka, Apr 30, 2024)
d9d49b5  format fixes (dsikka, Apr 30, 2024)
b111ee6  Merge branch 'main' into ds-quant (dsikka, May 1, 2024)
c31a7af  format fix post rebase (dsikka, May 1, 2024)
ca01b39  lazy import CompressedTensorsW8A8StaticTensor (#220) (varun-sundar-rabindranath, May 1, 2024)
f0197d4  lazy cutlass_gemm_dq import (#221) (varun-sundar-rabindranath, May 1, 2024)
4624b46  fix asm (May 1, 2024)
75757d5  update shape change (dsikka, May 2, 2024)
e1df0eb  add todo (dsikka, May 2, 2024)
bc0991c  Rename quant_per_tensor -> static_scaled_int8_quant (May 2, 2024)
74ad650  Remove cruft (May 2, 2024)
43c43f3  Merge branch 'main' into ds-quant (dsikka, May 14, 2024)
cf5600f  fixes : typo (May 14, 2024)
169ce7f  py-cutlass temporary hack for num_prompts==1 (May 15, 2024)
03b53e7  yapf (May 15, 2024)
f9df31b  add test_int8_quant (May 16, 2024)
ba4b6b3  call cpp cutlass (May 17, 2024)
3c223c6  Merge branch 'main' into ds-quant (dsikka, May 17, 2024)
b27f31a  remove cutlass py interface (May 17, 2024)
b589cdd  format.sh (May 17, 2024)
98159cf  remove fake-quant (May 17, 2024)
8dbeb31  add compressed tensors test (dsikka, May 17, 2024)
5eeb40a  remove torch.int8 (dsikka, May 17, 2024)
c55e023  format (dsikka, May 17, 2024)
f5cbbd3  fix config parsing to match new model (dsikka, May 20, 2024)
a685957  revert parsing to use default pathway (dsikka, May 20, 2024)
4dfb37f  PR comments (dsikka, May 21, 2024)
de81f9e  Fix scales/zero-points device allocation (May 21, 2024)
15f1863  ruff (May 21, 2024)
bd53847  add better comments (May 21, 2024)
b2926f3  add comment (dsikka, May 22, 2024)
1274386  Merge branch 'main' into ds-quant (dsikka, May 22, 2024)
18640c8  clang format (dsikka, May 22, 2024)
5c5dc84  clang format again (dsikka, May 22, 2024)
a44b4a0  address PR comments (May 22, 2024)
6f0e6e1  clang-format (May 22, 2024)
0090454  remove layer name (dsikka, May 23, 2024)
4b10fd7  remove unused import (dsikka, May 23, 2024)
68a59c7  remove parent name (dsikka, May 23, 2024)
b0afe67  Fix rounding (May 22, 2024)
4f4951e  comment (May 23, 2024)
869de3f  cruft (May 23, 2024)
e68e391  yapf (May 23, 2024)
d77cf50  remove unquantized check (dsikka, May 23, 2024)
format fixes
dsikka committed Apr 30, 2024
commit d9d49b5224dccb16eb28628ed9fb5f95b07437cc
10 changes: 5 additions & 5 deletions vllm/model_executor/layers/linear.py
@@ -1,5 +1,5 @@
from abc import abstractmethod
from typing import Any, Dict, List, Optional
from typing import List, Optional

import torch
import torch.nn.functional as F
@@ -38,7 +38,7 @@ def create_weights(self,
output_size: int,
params_dtype: torch.dtype,
layer_name: Optional[str] = None,
**extra_weight_attrs) -> Dict[str, Any]:
**extra_weight_attrs):
"""Create weights for a linear layer.
The weights will be set as attributes of the layer.

@@ -84,7 +84,7 @@ def create_weights(self,
output_size: int,
params_dtype: torch.dtype,
layer_name: Optional[str] = None,
**extra_weight_attrs) -> Dict[str, Any]:
**extra_weight_attrs):
weight = Parameter(torch.empty(sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
@@ -413,7 +413,7 @@ def weight_loader(self,
param_data = param_data.narrow(0, shard_offset, shard_size)
# If a param_shard_splitter is defined by the LinearMethod, use it.
Collaborator comment:
This does the same thing as scale_shard_splitter we had for fp8 ... we can rename to match fp8

but yes this will be addressed by the refactor

elif param_shard_splitter is not None:
logical_widths = getattr(param, "logical_widths")
logical_widths = getattr(param, "logical_widths", None)
param_data, loaded_weight = param_shard_splitter(
param_data, loaded_weight, loaded_shard_id, logical_widths)

@@ -601,7 +601,7 @@ def weight_loader(self,
shard_size)
# If a param_shard_splitter is defined by the LinearMethod, use it.
elif param_shard_splitter is not None:
logical_widths = getattr(param, "logical_widths")
logical_widths = getattr(param, "logical_widths", None)
param_data, loaded_weight = param_shard_splitter(
param_data, loaded_weight, loaded_shard_id, logical_widths)

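Aside (not part of this diff): the hunks above only show the call sites of the param_shard_splitter hook, so the following is an illustrative sketch, inferred from those call sites rather than taken from this PR, of what such a callback could look like. It narrows a merged parameter to the slice owned by one shard using the per-shard logical widths; the shard id is assumed here to be an integer index.

import torch
from typing import List, Tuple


def example_param_shard_splitter(
        param_data: torch.Tensor, loaded_weight: torch.Tensor,
        shard_id: int,
        logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
    # Offset of this shard is the sum of the widths of all preceding shards.
    offset = sum(logical_widths[:shard_id])
    size = logical_widths[shard_id]
    # Narrow the destination parameter to the region owned by this shard;
    # the loaded weight is returned unchanged in this simplified sketch.
    return param_data.narrow(0, offset, size), loaded_weight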
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/__init__.py
@@ -4,9 +4,9 @@
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.fp8 import Fp8Config
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsConfig)
from vllm.model_executor.layers.quantization.fp8 import Fp8Config
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinConfig)
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -5,11 +5,9 @@
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig)

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsW8A8StaticTensor, CompressedTensorsUnquantized,
CompressedTensorsScheme)
from vllm.model_executor.utils import set_weight_attrs
CompressedTensorsScheme, CompressedTensorsUnquantized,
CompressedTensorsW8A8StaticTensor)


class CompressedTensorsConfig(QuantizationConfig):
@@ -138,8 +136,8 @@ def create_weights(self,
layer_name: Optional[str] = None,
**extra_weight_attrs):
"""
Use the CompressedTensorsScheme associated with each layer to create the
necessary parameters for the layer.
Use the CompressedTensorsScheme associated with each layer to create
the necessary parameters for the layer.
"""
weight_loader = extra_weight_attrs.get("weight_loader")

@@ -160,8 +158,9 @@ def apply(self,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None):
"""
Use the output of create_weights and the CompressedTensorsScheme associated with
the layer to apply the forward pass with the layer input.
Use the output of create_weights and the CompressedTensorsScheme
associated with the layer to apply the forward pass with the
layer input.
"""

if bias is not None:
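Aside (not part of this diff): the two docstrings above describe the delegation pattern this config uses: each layer is paired with a CompressedTensorsScheme that registers the layer's parameters in create_weights and later runs the forward pass in apply. A condensed, hypothetical sketch of that flow (names invented for illustration, bias handling omitted):

import torch


def create_weights_with_scheme(layer: torch.nn.Module, scheme,
                               input_size: int, output_size: int,
                               params_dtype: torch.dtype) -> None:
    # Let the per-layer scheme register whatever parameters it needs,
    # then remember the scheme so the forward pass can find it later.
    scheme.create_weights(layer=layer, input_size=input_size,
                          output_size=output_size, params_dtype=params_dtype)
    layer.scheme = scheme


def apply_with_scheme(layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
    # The forward pass is delegated to the same scheme that created the weights.
    return layer.scheme.apply_weights(layer, x)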
vllm/model_executor/layers/quantization/compressed_tensors/cutlass_gemm.py
@@ -1,9 +1,9 @@
from typing import Any, Dict, Optional, Tuple, Union

import cutlass
from cutlass import Tensor as FakeTensor
import cutlass.epilogue

import torch
from typing import Optional, Tuple, Dict, Union, Any
from cutlass import Tensor as FakeTensor

from vllm.logger import init_logger

vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,3 +1,5 @@
from .compressed_tensors_scheme import CompressedTensorsScheme
from .compressed_tensors_unquantized import CompressedTensorsUnquantized
from .compressed_tensors_w8a8_statictensor import CompressedTensorsW8A8StaticTensor
from .compressed_tensors_scheme import CompressedTensorsScheme # noqa: F401
from .compressed_tensors_unquantized import ( # noqa: F401
CompressedTensorsUnquantized)
from .compressed_tensors_w8a8_statictensor import ( # noqa: F401, E501
CompressedTensorsW8A8StaticTensor)
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,13 +1,14 @@
from abc import ABC, abstractmethod

import torch

__all__ = ["CompressedTensorsScheme"]


class CompressedTensorsScheme(ABC):
"""
Abstract class used to describe the weight creation and forward pass of different
quantization schemes supported by CompressedTensors.
Abstract class used to describe the weight creation and forward pass
of different quantization schemes supported by CompressedTensors.
"""

@abstractmethod
@@ -21,11 +22,11 @@ def create_weights(self, *args, **kwargs):
@abstractmethod
def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
"""
Run the forward pass for the particular scheme. This is where scheme-specific
dequant/quant steps/kernels should be applied.
Run the forward pass for the particular scheme. This is where
scheme-specific dequant/quant steps/kernels should be applied.

:param layer: toch.nn.Module with the registered weights and other parameters
relevant to the particular scheme.
:param layer: toch.nn.Module with the registered weights and
other parameters relevant to the particular scheme.
:param x: input to the layer

"""
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
@@ -1,18 +1,21 @@
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from typing import Callable, List

import torch
from typing import List, Callable
import torch.nn.functional as F
from torch.nn import Parameter

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.utils import set_weight_attrs
import torch.nn.functional as F

__all__ = ["CompressedTensorsUnquantized"]


class CompressedTensorsUnquantized(CompressedTensorsScheme):
"""
Implements the scheme for all layers which are ignored in the CompressedTensors
config. The input and loaded weight are used in a linear transformation.
Implements the scheme for all layers which are ignored
in the CompressedTensors config. The input and loaded weight are used
in a linear transformation.
"""

def create_weights(self, layer: torch.nn.Module,
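Aside (not part of this diff): the CompressedTensorsUnquantized docstring above says the scheme simply applies a linear transformation to the loaded weight. Purely as an illustration of the CompressedTensorsScheme interface from the previous file, a stripped-down, hypothetical pass-through scheme could look roughly like this (the real create_weights in this PR also wires up weight loading via set_weight_attrs, which is omitted here):

import torch
import torch.nn.functional as F

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
    CompressedTensorsScheme)


class ExamplePassthroughScheme(CompressedTensorsScheme):
    """Hypothetical scheme: unquantized weight, plain linear forward."""

    def create_weights(self, layer: torch.nn.Module, input_size: int,
                       output_size: int, params_dtype: torch.dtype, **kwargs):
        # Register an (initially empty) weight; actual values are loaded later.
        weight = torch.nn.Parameter(
            torch.empty(output_size, input_size, dtype=params_dtype))
        layer.register_parameter("weight", weight)

    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
        # Plain, unquantized linear transformation on the registered weight.
        return F.linear(x, layer.weight)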
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -1,12 +1,14 @@
from typing import Callable, List, Tuple, Union

import torch
from typing import List, Union, Tuple, Callable
from vllm.model_executor.layers.quantization.compressed_tensors.cutlass_gemm import (
from torch.nn import Parameter

from vllm._C import ops
from vllm.model_executor.layers.quantization.compressed_tensors.cutlass_gemm import ( # noqa: E501
cutlass_gemm_dq)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.utils import set_weight_attrs
from torch.nn import Parameter
from vllm._C import ops

__all__ = ["CompressedTensorsW8A8StaticTensor"]

@@ -94,7 +96,7 @@ def create_weights(self, layer: torch.nn.Module,

layer.register_parameter("weight", weight)
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
# Register parameter with the layer; register weight loader with each parameter

set_weight_attrs(weight, {"weight_loader": weight_loader})
set_weight_attrs(weight, {"logical_widths": output_partition_sizes})

@@ -122,8 +124,8 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
x_q = self._quantize_single(x, act_scale[0].item())

# Weight quantize
# TODO : try not to remove device-to-host copy. i.e. keep the non-duplicated version
# of scales in the CPU
# TODO : try not to remove device-to-host copy.
# i.e. keep the non-duplicated version of scales in the CPU
if self.fake_quant:
w_scales = [
weight_scale[sum(logical_widths[:i])].item()
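Aside (not part of this diff): in the apply_weights path above, the activation is quantized with a single precomputed (static) per-tensor scale before the CUTLASS int8 GEMM; the actual kernel is exposed through vllm._C.ops and was renamed to static_scaled_int8_quant in commit bc0991c. Purely as a reference for what static per-tensor int8 quantization computes, a plain-PyTorch equivalent might look like the following; the kernel's exact rounding behavior may differ (the later "Fix rounding" commit touches exactly that).

import torch


def static_scaled_int8_quant_ref(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Scale by the precomputed activation scale, round to the nearest integer,
    # then clamp to the signed int8 range before casting.
    return torch.round(x / scale).clamp(-128, 127).to(torch.int8)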
1 change: 1 addition & 0 deletions vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -206,6 +206,7 @@ def create_weights(
input_size: int,
output_size: int,
params_dtype: torch.dtype,
layer_name: Optional[str] = None,
**extra_weight_attrs,
) -> None:
del output_size