From 191bd7701381ea8b162697a072044b8db7bffa17 Mon Sep 17 00:00:00 2001
From: LP
Date: Tue, 21 Jan 2025 14:42:32 +0800
Subject: [PATCH] fix the issue of qlinear packing being too slow. (#770)

The `for` loop in the `pack` function is too slow; replace it with a single tensor operation.

---
 auto_gptq/nn_modules/qlinear/qlinear_exllama.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/auto_gptq/nn_modules/qlinear/qlinear_exllama.py b/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
index cfff42a7..fe540c19 100644
--- a/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
+++ b/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
@@ -134,14 +134,7 @@ def pack(self, linear, scales, zeros, g_idx=None):
         if linear.bias is not None:
             self.bias = linear.bias.clone().half()
 
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(
-                torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
-                    :, None
-                ]
-            )
-        intweight = torch.cat(intweight, dim=1)
+        intweight = torch.round((W + scale_zeros[g_idx].T) / scales[g_idx].T).to(torch.int)
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
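
Note: the sketch below is not part of the patch; it is a minimal standalone illustration of why the one-line replacement is equivalent to the removed loop. The tensor names (W, scales, scale_zeros, g_idx) follow the patched code, but the concrete shapes and random data are hypothetical, chosen only to make the comparison runnable. Indexing scale_zeros[g_idx] gathers the per-group row for every input column at once, giving an [in, out] tensor whose transpose broadcasts against W, so one round/divide over the whole matrix replaces in_features separate calls.

    import torch

    # Hypothetical sizes for illustration; in pack() these come from the quantized linear layer.
    out_features, in_features, groups = 8, 16, 4

    W = torch.randn(out_features, in_features)                      # fp weight, [out, in]
    scales = torch.rand(groups, out_features) + 0.5                 # per-group scales, [groups, out]
    scale_zeros = torch.rand(groups, out_features)                  # zero points * scales, [groups, out]
    g_idx = torch.arange(in_features) // (in_features // groups)    # column -> group index, [in]

    # Old approach: quantize one input column per Python-loop iteration.
    cols = []
    for idx in range(in_features):
        cols.append(
            torch.round((W[:, idx] + scale_zeros[g_idx[idx]]) / scales[g_idx[idx]]).to(torch.int)[:, None]
        )
    intweight_loop = torch.cat(cols, dim=1)

    # New approach: gather all groups at once; scale_zeros[g_idx] is [in, out], .T makes it [out, in].
    intweight_vec = torch.round((W + scale_zeros[g_idx].T) / scales[g_idx].T).to(torch.int)

    # Both paths apply the same elementwise ops, so the results match exactly.
    assert torch.equal(intweight_loop, intweight_vec)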