Commit 97dfea8

Decompose scaled_int_mm for CPU
1 parent 9f366a9 commit 97dfea8

File tree

2 files changed: +15 -29 lines changed

torchao/kernel/intmm.py

+8 -29
@@ -112,31 +112,6 @@ def int_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
     return safe_int_mm(a, b)


-# If users install cpu-only pytorch, triton won't be available by default
-# This pass adds definition of int_scaled_matmul for this case
-if intmm_triton:
-    lib = intmm_triton.lib
-else:
-    lib = torch.library.Library("torchao", "FRAGMENT")
-    lib.define("int_scaled_matmul(Tensor a, Tensor b, Tensor scales1) -> Tensor")
-
-
-@torch.library.impl(lib, "int_scaled_matmul", "Meta")
-def int_scaled_matmul_meta(a, b, scales1):
-    M, K = a.shape
-    K, N = b.shape
-    return torch.empty((M, N), device=a.device, dtype=scales1.dtype)
-
-
-@torch.library.impl(lib, "int_scaled_matmul", "CPU")
-def int_scaled_matmul_cpu(a, b, scales1):
-    if TORCH_VERSION_AT_LEAST_2_6:
-        c = torch._int_mm(a, b)
-        return c.to(scales1.dtype) * scales1
-    else:
-        return safe_int_mm(a, b) * scales1
-
-
 def int_scaled_matmul(a: torch.Tensor, b: torch.Tensor, scales1: torch.Tensor) -> torch.Tensor:
     """
     Performs scaled integer matrix multiplication.
@@ -159,10 +134,14 @@ def int_scaled_matmul(a: torch.Tensor, b: torch.Tensor, scales1: torch.Tensor) -> torch.Tensor:
     assert scales1.is_contiguous()
     scales1 = scales1.expand((M, N))
     assert scales1.dim() == 2
-    if (
-        (intmm_triton is not None and AUTOTUNER_ENABLE)
-        or scales1.device.type == "cpu"
-    ):
+
+    if scales1.device.type == "cpu" and TORCH_VERSION_AT_LEAST_2_6:
+        # CPU prefers decomposed version of int_scaled_matmul
+        # to leverage the fusion capability of Inductor
+        c = torch._int_mm(a, b)
+        return c.to(scales1.dtype) * scales1
+
+    if intmm_triton is not None and AUTOTUNER_ENABLE:
         return torch.ops.torchao.int_scaled_matmul(a, b, scales1)

     c = safe_int_mm(a, b)
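For context, a minimal sketch (not part of this commit) of how the decomposed CPU path is exercised; the shapes, dtypes, and use of torch.compile here are illustrative assumptions, not taken from the diff:

import torch

from torchao.kernel.intmm import int_scaled_matmul

# int8 operands with a per-row float32 scale; shapes are arbitrary examples
a = torch.randint(-128, 127, (64, 32), dtype=torch.int8)
b = torch.randint(-128, 127, (32, 32), dtype=torch.int8)
scales1 = torch.rand(64, 1, dtype=torch.float32)

# On CPU with PyTorch >= 2.6, int_scaled_matmul now decomposes into
# torch._int_mm plus a cast and multiply, so Inductor can fuse the
# cast/multiply epilogue when the call is compiled instead of hitting
# an opaque custom op.
compiled_fn = torch.compile(int_scaled_matmul)
out = compiled_fn(a, b, scales1)  # float32, shape (64, 32)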

torchao/kernel/intmm_triton.py

+7
@@ -334,6 +334,13 @@ def int_matmul_cuda(a, b):
     return int_matmul_kernel(a, b, c, best_config)


+@torch.library.impl(lib, "int_scaled_matmul", "Meta")
+def int_scaled_matmul_meta(a, b, scales1):
+    M, K = a.shape
+    K, N = b.shape
+    return torch.empty((M, N), device=a.device, dtype=scales1.dtype)
+
+
 @torch.library.impl(lib, "int_scaled_matmul", "CUDA")
 def int_scaled_matmul_cuda(a, b, scales1):
     # Check constraints.
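A small sketch (again, not from this commit) of what the relocated Meta registration provides: shape and dtype inference for the custom op without launching a kernel, which is what FakeTensor tracing under torch.compile relies on. It assumes a build where triton is available, so that importing torchao.kernel.intmm_triton registers the op:

import torch

import torchao.kernel.intmm_triton  # noqa: F401  (registers the custom op; needs triton)

# Meta tensors carry only shape/dtype metadata, no storage.
a = torch.empty(64, 32, dtype=torch.int8, device="meta")
b = torch.empty(32, 32, dtype=torch.int8, device="meta")
scales1 = torch.empty(64, 32, dtype=torch.float32, device="meta")

# Dispatches to int_scaled_matmul_meta: an empty (M, N) tensor with
# scales1's dtype comes back, and no CUDA kernel runs.
out = torch.ops.torchao.int_scaled_matmul(a, b, scales1)
print(out.shape, out.dtype)  # torch.Size([64, 32]) torch.float32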
