From 191bd7701381ea8b162697a072044b8db7bffa17 Mon Sep 17 00:00:00 2001
From: LP
Date: Tue, 21 Jan 2025 14:42:32 +0800
Subject: [PATCH] fix the issue of qlinear packing being too slow. (#770)

The `for` loop in the `pack` function is too slow; replace it with a single tensor operation.

---
 auto_gptq/nn_modules/qlinear/qlinear_exllama.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/auto_gptq/nn_modules/qlinear/qlinear_exllama.py b/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
index cfff42a7..fe540c19 100644
--- a/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
+++ b/auto_gptq/nn_modules/qlinear/qlinear_exllama.py
@@ -134,14 +134,7 @@ def pack(self, linear, scales, zeros, g_idx=None):
         if linear.bias is not None:
             self.bias = linear.bias.clone().half()
 
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(
-                torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
-                    :, None
-                ]
-            )
-        intweight = torch.cat(intweight, dim=1)
+        intweight = torch.round((W + scale_zeros[g_idx].T) / scales[g_idx].T).to(torch.int)
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
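
Note: the sketch below is not part of the patch; it is a minimal standalone illustration of why the one-line replacement is equivalent to the removed loop. The tensor names (W, scales, scale_zeros, g_idx) follow the patched code, but the concrete shapes and random data are hypothetical, chosen only to make the comparison runnable. Indexing scale_zeros[g_idx] gathers the per-group row for every input column at once, giving an [in, out] tensor whose transpose broadcasts against W, so one round/divide over the whole matrix replaces in_features separate calls.

    import torch

    # Hypothetical sizes for illustration; in pack() these come from the quantized linear layer.
    out_features, in_features, groups = 8, 16, 4

    W = torch.randn(out_features, in_features)                      # fp weight, [out, in]
    scales = torch.rand(groups, out_features) + 0.5                 # per-group scales, [groups, out]
    scale_zeros = torch.rand(groups, out_features)                  # zero points * scales, [groups, out]
    g_idx = torch.arange(in_features) // (in_features // groups)    # column -> group index, [in]

    # Old approach: quantize one input column per Python-loop iteration.
    cols = []
    for idx in range(in_features):
        cols.append(
            torch.round((W[:, idx] + scale_zeros[g_idx[idx]]) / scales[g_idx[idx]]).to(torch.int)[:, None]
        )
    intweight_loop = torch.cat(cols, dim=1)

    # New approach: gather all groups at once; scale_zeros[g_idx] is [in, out], .T makes it [out, in].
    intweight_vec = torch.round((W + scale_zeros[g_idx].T) / scales[g_idx].T).to(torch.int)

    # Both paths apply the same elementwise ops, so the results match exactly.
    assert torch.equal(intweight_loop, intweight_vec)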