From 77811c60cf1617298fdeda01c1d9d176ed2056ef Mon Sep 17 00:00:00 2001
From: Casper Hansen
Date: Fri, 26 Jan 2024 10:53:17 +0000
Subject: [PATCH] Apply code suggestions

---
 setup.py                                       | 4 +---
 vllm/model_executor/layers/quantization/awq.py | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 267dc970e12e7..fb37a8d952314 100644
--- a/setup.py
+++ b/setup.py
@@ -255,9 +255,7 @@ def get_torch_arch_list() -> Set[str]:
 ]
 
 if _is_cuda():
-    vllm_extension_sources.extend([
-        "csrc/quantization/awq/gemm_kernels.cu",
-    ])
+    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
 
 if not _is_neuron():
     vllm_extension = CUDAExtension(
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 4d80bea676a67..4d3fd3ec0cc71 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -154,8 +154,8 @@ def apply_weights(self,
         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
         reshaped_x = x.reshape(-1, x.shape[-1])
 
-        # batch_size*seq_len >= threshold
-        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 256
+        # num_tokens >= threshold
+        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
 
         if FP16_MATMUL_HEURISTIC_CONDITION:
             out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
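
Editor's note on the awq.py hunk: the sketch below (not part of the patch;
tensor names and sizes are illustrative) shows why the new heuristic is more
robust. The old expression x.shape[0] * x.shape[1] only equals the token
count when x is 3D ([batch, seq_len, hidden]); x.shape[:-1].numel() takes
the product of every dimension except the last (hidden) one, so it counts
tokens for both 2D and 3D inputs.

    import torch

    # Illustrative inputs (names/sizes are assumptions, not from the patch).
    x_2d = torch.empty(300, 4096)    # already flattened: [num_tokens, hidden]
    x_3d = torch.empty(4, 75, 4096)  # batched: [batch, seq_len, hidden]

    # Old heuristic: correct only for 3D input; for 2D input it multiplies
    # the token count by the hidden size and wrongly trips the threshold.
    old_3d = x_3d.shape[0] * x_3d.shape[1]  # 300
    old_2d = x_2d.shape[0] * x_2d.shape[1]  # 1228800

    # New heuristic: product of all dims except the hidden dim, i.e. the
    # number of tokens, regardless of the input's rank.
    new_3d = x_3d.shape[:-1].numel()        # 300
    new_2d = x_2d.shape[:-1].numel()        # 300

    assert new_2d == new_3d == 300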