@@ -22,6 +22,7 @@
     marlin_import_exception,
 )
 from gptqmodel.nn_modules.qlinear.awq_torch import AwqTorchQuantLinear
+from gptqmodel.nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear
 from gptqmodel.utils.marlin import marlin_make_workspace_new


@@ -30,6 +31,7 @@
 log = LogBar.shared()

 DEVICE = torch.device("cuda:0")
+CPU_DEVICE = torch.device("cpu")

 GREEN = "\033[32m"
 RED = "\033[31m"
@@ -50,6 +52,7 @@ class TestAwqKernelOutput(unittest.TestCase):
         (BACKEND.GEMM, torch.float16, 0.004),
         # (BACKEND.GEMM, torch.bfloat16, 0.05),
         (BACKEND.MARLIN, torch.float16, 0.006),
+        (BACKEND.TORCH_FUSED_AWQ, torch.float16, 0.004),
         # (BACKEND.MARLIN, torch.bfloat16, 0.05),
     ]

@@ -92,6 +95,16 @@ def setUpClass(cls) -> None:
             qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu
         )

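+        # Build the torch-fused AWQ module; if the fused kernel is unavailable
+        # on this host, record a skip reason instead of failing class setup.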
+        try:
+            cls.modules[BACKEND.TORCH_FUSED_AWQ] = cls._build_torch_fused_awq_module(
+                qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu
+            )
+        except Exception as exc:
+            cls.backend_skip_reason[BACKEND.TORCH_FUSED_AWQ] = (
+                f"Torch fused AWQ kernel unavailable: {exc}"
+            )
+            cls.modules[BACKEND.TORCH_FUSED_AWQ] = None
+
         base_inputs = cls._generate_inputs()
         cls.inputs: Dict[torch.dtype, List[torch.Tensor]] = {}
         cls.reference_outputs: Dict[torch.dtype, List[torch.Tensor]] = {}
@@ -247,6 +260,35 @@ def _build_torch_awq_module(
         module.post_init()
         return module

+    @classmethod
+    def _build_torch_fused_awq_module(
+        cls,
+        qweight_cpu: torch.Tensor,
+        qzeros_cpu: torch.Tensor,
+        scales_cpu: torch.Tensor,
+        bias_cpu: torch.Tensor,
+    ) -> TorchFusedAwqQuantLinear:
+        module = TorchFusedAwqQuantLinear(
+            bits=cls.BITS,
+            group_size=cls.GROUP_SIZE,
+            sym=True,
+            desc_act=False,
+            in_features=cls.in_features,
+            out_features=cls.out_features,
+            bias=True,
+            adapter=None,
+            register_buffers=True,
+        ).to(CPU_DEVICE)
+
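+        # Load the prepacked quantized tensors into the module's registered
+        # buffers; scales and bias are cast to float16 to match the kernel dtype.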
+        module.qweight.copy_(qweight_cpu.to(CPU_DEVICE))
+        module.qzeros.copy_(qzeros_cpu.to(CPU_DEVICE))
+        module.scales.copy_(scales_cpu.to(torch.float16).to(CPU_DEVICE))
+        module.bias.copy_(bias_cpu.to(torch.float16).to(CPU_DEVICE))
+
+        module.eval()
+        module.post_init()
+        return module
+
     @classmethod
     def _generate_inputs(cls) -> List[torch.Tensor]:
         large_shapes = [(4, 32), (2, 64), (1, 96)]
@@ -288,19 +330,37 @@ def _forward(
         *,
         compute_dtype: Optional[torch.dtype] = None,
         output_dtype: Optional[torch.dtype] = None,
+        target_device: Optional[torch.device] = None,
     ) -> List[torch.Tensor]:
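+        # Resolve the device from the module itself so inputs follow CPU-hosted
+        # backends (the torch-fused AWQ module lives on CPU) as well as CUDA ones.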
+        if target_device is None:
+            target_device = cls._infer_module_device(module)
         outputs: List[torch.Tensor] = []
         with torch.inference_mode():
             for tensor in inputs:
                 local_tensor = tensor
-                if compute_dtype is not None and tensor.dtype != compute_dtype:
-                    local_tensor = tensor.to(dtype=compute_dtype)
+                if local_tensor.device != target_device:
+                    local_tensor = local_tensor.to(device=target_device)
+                if compute_dtype is not None and local_tensor.dtype != compute_dtype:
+                    local_tensor = local_tensor.to(dtype=compute_dtype)
                 result = module(local_tensor)
                 if output_dtype is not None and result.dtype != output_dtype:
                     result = result.to(dtype=output_dtype)
                 outputs.append(result.detach().cpu())
         return outputs

+    @staticmethod
+    def _infer_module_device(module: torch.nn.Module) -> torch.device:
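+        # Prefer the first parameter's device, fall back to the first buffer's,
+        # and default to CPU when the module owns no tensors at all.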
+        try:
+            tensor = next(module.parameters())
+            return tensor.device
+        except StopIteration:
+            pass
+        try:
+            tensor = next(module.buffers())
+            return tensor.device
+        except StopIteration:
+            return torch.device("cpu")
+
     def _maybe_skip_backend(self, backend: BACKEND) -> None:
         reason = self.backend_skip_reason.get(backend)
         if reason: