From 77811c60cf1617298fdeda01c1d9d176ed2056ef Mon Sep 17 00:00:00 2001
From: Casper Hansen
Date: Fri, 26 Jan 2024 10:53:17 +0000
Subject: [PATCH] Apply code suggestions

---
 setup.py                                       | 4 +---
 vllm/model_executor/layers/quantization/awq.py | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 267dc970e12e7..fb37a8d952314 100644
--- a/setup.py
+++ b/setup.py
@@ -255,9 +255,7 @@ def get_torch_arch_list() -> Set[str]:
 ]
 
 if _is_cuda():
-    vllm_extension_sources.extend([
-        "csrc/quantization/awq/gemm_kernels.cu",
-    ])
+    vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
 
 if not _is_neuron():
     vllm_extension = CUDAExtension(
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 4d80bea676a67..4d3fd3ec0cc71 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -154,8 +154,8 @@ def apply_weights(self,
         out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
         reshaped_x = x.reshape(-1, x.shape[-1])
 
-        # batch_size*seq_len >= threshold
-        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[0] * x.shape[1] >= 256
+        # num_tokens >= threshold
+        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
 
         if FP16_MATMUL_HEURISTIC_CONDITION:
             out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
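
Editor's note on the awq.py hunk: the sketch below (not part of the patch;
tensor names and sizes are illustrative) shows why the new heuristic is more
robust. The old expression x.shape[0] * x.shape[1] only equals the token
count when x is 3D ([batch, seq_len, hidden]); x.shape[:-1].numel() takes
the product of every dimension except the last (hidden) one, so it counts
tokens for both 2D and 3D inputs.

    import torch

    # Illustrative inputs (names/sizes are assumptions, not from the patch).
    x_2d = torch.empty(300, 4096)    # already flattened: [num_tokens, hidden]
    x_3d = torch.empty(4, 75, 4096)  # batched: [batch, seq_len, hidden]

    # Old heuristic: correct only for 3D input; for 2D input it multiplies
    # the token count by the hidden size and wrongly trips the threshold.
    old_3d = x_3d.shape[0] * x_3d.shape[1]  # 300
    old_2d = x_2d.shape[0] * x_2d.shape[1]  # 1228800

    # New heuristic: product of all dims except the hidden dim, i.e. the
    # number of tokens, regardless of the input's rank.
    new_3d = x_3d.shape[:-1].numel()        # 300
    new_2d = x_2d.shape[:-1].numel()        # 300

    assert new_2d == new_3d == 300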