Apply code suggestions

vllm-project · Jan 26, 2024 · 77811c6
1 parent 4149086
commit 77811c6
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 5 deletions.
diff --git a/setup.py b/setup.py
@@ -255,9 +255,7 @@ def get_torch_arch_list() -> Set[str]:
 ]
 
 if _is_cuda():
-    vllm_extension_sources.extend([
-        "csrc/quantization/awq/gemm_kernels.cu",
-    ])
+    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
 
 if not _is_neuron():
     vllm_extension = CUDAExtension(

diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
@@ -154,8 +154,8 @@ def apply_weights(self,
         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
         reshaped_x = x.reshape(-1, x.shape[-1])
 
-        # batch_size*seq_len >= threshold
-        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 256
+        # num_tokens >= threshold
+        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
 
         if FP16_MATMUL_HEURISTIC_CONDITION:
             out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)