Commit 62508c5

review comments

1 parent 2d25a9a commit 62508c5

6 files changed (+45, -15 lines)

csrc/permute_cols.cu

Lines changed: 8 additions & 6 deletions
@@ -10,6 +10,7 @@ static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
 
 // For a given "a" of size [M,K] performs a permutation of the K columns based
 // on the given "perm" indices.
+// Currently only supports 16bit types (since we permute halfs)
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                     int const* __restrict__ perm_int_ptr,
                                     int4* __restrict__ out_int4_ptr, int size_m,
@@ -61,26 +62,27 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
   }
 }
 
-// More efficient version of A[:, perm]
+// More efficient version of A[..., perm]
 // taken from gptq_marlin.cu
 torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
   auto dev = A.get_device();
   auto stream = at::cuda::getCurrentCUDAStream(dev);
 
   TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
-              "Only half and bfloat16 are supported");
+              "Currently only 16bit types are supported");
   TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
-  TORCH_CHECK(A.size(1) % 8 == 0,
+  TORCH_CHECK(A.size(-1) % 8 == 0,
               "A columns must be a multiple of 8 (128bits)");
+  auto A_2d = A.view({-1, A.size(-1)});
 
   torch::Tensor D = torch::empty_like(A);
   int sms;
   cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
-  int block_rows = div_ceil(A.size(0), sms);
+  int block_rows = div_ceil(A_2d.size(0), sms);
   permute_cols_kernel<<<sms, default_threads, 0, stream>>>(
-      reinterpret_cast<int4 const*>(A.const_data_ptr()),
+      reinterpret_cast<int4 const*>(A_2d.const_data_ptr()),
       perm.const_data_ptr<int>(), reinterpret_cast<int4*>(D.mutable_data_ptr()),
-      A.size(0), A.size(1), block_rows);
+      A_2d.size(0), A_2d.size(1), block_rows);
   return D;
 }
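The reworked entry point now accepts tensors with arbitrary leading dims by flattening them into the row dimension before launching the kernel. For illustration, a minimal PyTorch sketch of the intended semantics (a reference implementation only, not the CUDA path; the name permute_cols_reference is made up):

import torch

def permute_cols_reference(A: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
    # Same preconditions as the CUDA entry point: 16-bit dtype, contiguous,
    # last dim a multiple of 8 elements (128 bits).
    assert A.dtype in (torch.float16, torch.bfloat16)
    assert A.is_contiguous() and A.size(-1) % 8 == 0
    A_2d = A.view(-1, A.size(-1))   # flatten leading dims, as the commit does
    D_2d = A_2d[:, perm]            # permute columns of the 2D view
    return D_2d.view(A.shape)       # output keeps the original shape

For any leading batch dims this matches plain fancy indexing, i.e. A[..., perm]:

    A = torch.randn(2, 4, 16, dtype=torch.float16)
    perm = torch.randperm(16)
    torch.testing.assert_close(permute_cols_reference(A, perm), A[..., perm])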

csrc/quantization/machete/generate.py

Lines changed: 1 addition & 0 deletions
@@ -335,6 +335,7 @@ def generate():
     )
 
     # For now we use the same heuristic for all types
+    # Heuristic is currently tuned for H100s
     default_heuristic = [
         #### M = 257+
         (

csrc/quantization/machete/machete_mm_kernel.cuh

Lines changed: 2 additions & 2 deletions
@@ -152,8 +152,8 @@ struct MacheteKernelTemplate {
 
     int M = size<0>(layout_A), N = size<1>(layout_D), K = size<1>(layout_A);
 
-    int group_size = maybe_group_size.value_or(K);
-    group_size = (group_size == -1) ? K : group_size;
+    int const group_size =
+        maybe_group_size == -1 ? K : maybe_group_size.value_or(K);
     int const scale_k = (K + group_size - 1) / group_size;
 
     TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K);
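The rewritten expression folds two conventions into one: an absent group size (nullopt) and the sentinel -1 both mean a single quantization group spanning all of K. A minimal Python sketch of the same normalization, for illustration only (the helper name resolve_group_size is made up):

from typing import Optional, Tuple

def resolve_group_size(maybe_group_size: Optional[int], K: int) -> Tuple[int, int]:
    # None and -1 both mean "one group covering the whole K dimension".
    group_size = K if maybe_group_size in (None, -1) else maybe_group_size
    scale_k = (K + group_size - 1) // group_size  # ceil-div: scale groups along K
    return group_size, scale_k

Examples: resolve_group_size(None, 4096) -> (4096, 1), resolve_group_size(-1, 4096) -> (4096, 1), resolve_group_size(128, 4096) -> (128, 32).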

vllm/model_executor/layers/quantization/kernels/__init__.py

Lines changed: 24 additions & 4 deletions
@@ -1,12 +1,14 @@
 import os
 from typing import List, Optional, Type
 
+from vllm.model_executor.layers.quantization.kernels.machete import (
+    MacheteLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.marlin import (
+    MarlinLinearKernel)
+from vllm.model_executor.layers.quantization.kernels.MPLinearKernel import (
+    MPLinearKernel, MPLinearLayerConfig)
 from vllm.platforms import current_platform
 
-from .MacheteLinearKernel import MacheteLinearKernel
-from .MarlinLinearKernel import MarlinLinearKernel
-from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
-
 # in priority/performance order (when available)
 _POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
     MacheteLinearKernel,
@@ -17,6 +19,24 @@
 def choose_mp_linear_kernel(
         config: MPLinearLayerConfig,
         compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
+    """
+    Choose an MPLinearKernel that can implement the given config for the given
+    compute capability. Attempts to choose the best kernel in terms of
+    performance.
+
+    Args:
+        config (MPLinearLayerConfig): Description of the linear layer to be
+            implemented.
+        compute_capability (Optional[int], optional): The compute capability of
+            the target device; if None, uses `current_platform` to get the
+            compute capability. Defaults to None.
+
+    Raises:
+        ValueError: If no kernel can implement the given config.
+
+    Returns:
+        Type[MPLinearKernel]: Chosen kernel.
+    """
     if compute_capability is None:
         if current_platform is None:
             raise ValueError("Cannot determine compute capability")
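The new docstring describes a priority-ordered search over _POSSIBLE_KERNELS. As a rough sketch of how such a selection typically proceeds (the can_implement classmethod and its (bool, reason) return shape are assumptions for illustration; only the priority list, the docstring, and the ValueError appear in this diff):

from typing import List

def _choose_kernel_sketch(config, kernels: List[type]) -> type:
    # Walk the kernels in priority/performance order and return the first one
    # that reports it can implement the given config.
    failure_reasons: List[str] = []
    for kernel in kernels:
        can_use, reason = kernel.can_implement(config)  # hypothetical interface
        if can_use:
            return kernel
        failure_reasons.append(f"  {kernel.__name__}: {reason}")
    raise ValueError("No kernel can implement the given config:\n" +
                     "\n".join(failure_reasons))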

vllm/model_executor/layers/quantization/kernels/MacheteLinearKernel.py renamed to vllm/model_executor/layers/quantization/kernels/machete.py

Lines changed: 4 additions & 1 deletion
@@ -1,4 +1,7 @@
 from functools import partial
+from typing import Optional, Tuple
+
+import torch
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.machete_utils import (
@@ -9,7 +12,7 @@
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            permute_param_layout_)
 
-from .MPLinearKernel import *
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class MacheteLinearKernel(MPLinearKernel):

vllm/model_executor/layers/quantization/kernels/MarlinLinearKernel.py renamed to vllm/model_executor/layers/quantization/kernels/marlin.py

Lines changed: 6 additions & 2 deletions
@@ -1,3 +1,7 @@
+from typing import Optional, Tuple
+
+import torch
+
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
@@ -7,7 +11,7 @@
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            permute_param_layout_)
 
-from .MPLinearKernel import *
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
 
 
 class MarlinLinearKernel(MPLinearKernel):
@@ -111,7 +115,7 @@ def apply_weights(self,
         c = self.config
         w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)
 
-        # `process_weights_after_loading`` will ensure w_zp and w_gidx are not
+        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
         # None for marlin
         return apply_gptq_marlin_linear(
             input=x,
