[Inductor] Fix consolidating _scaled_mm into mm template TMA error (pytorch#150686)

PaulZhang12 · amathewc · commit 1620f75d0697 · 2025-04-17T07:03:31.000+03:00
Summary: The previous diff broke a few tests that did not run on internal or GH CI (T220169086); this fixes that issue. Based on other examples, the {% if %} block is only supposed to support autotuned parameters (constexpr) and should not be used for locals. This change therefore replaces the template-level {% if ki == k_tiles - 1 %} conditionals with regular `if ki == k_tiles - 1:` statements in the kernel body. Test Plan: buck test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:fp8 -- --exact 'caffe2/test/inductor:fp8 - test_tensorwise_scaling_bfloat16_shape_16,32,32_has_bias_False_use_fast_accum_True_persistent_matmul_True (caffe2.test.inductor.test_fp8.TestFP8Lowering)' Reviewed By: NikhilAPatel Differential Revision: D72460516 Pull Request resolved: pytorch#150686 Approved by: https://github.com/eellison, https://github.com/NikhilAPatel
diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py
@@ -312,18 +312,18 @@
             allow_tf32=ALLOW_TF32,
         )
 
-        {% if ki == k_tiles - 1 %}
-        # rematerialize rm and rn to save registers
-        rcm = rm + tl.arange(0, BLOCK_M)
-        rcn = rn + tl.arange(0, BLOCK_N)
-        idx_m = rcm[:, None]
-        idx_n = rcn[None, :]
-        mask = (idx_m < M) & (idx_n < N)
-
-        # inductor generates a suffix
-        {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
-        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
-        {% endif %}
+        if ki == k_tiles - 1:
+            # rematerialize rm and rn to save registers
+            rcm = rm + tl.arange(0, BLOCK_M)
+            rcn = rn + tl.arange(0, BLOCK_N)
+            idx_m = rcm[:, None]
+            idx_n = rcn[None, :]
+            mask = (idx_m < M) & (idx_n < N)
+
+            # inductor generates a suffix
+            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
+            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+
 """,
 )
 
@@ -467,31 +467,30 @@ def apply_scaling(
         else:
             accumulator += tl.dot(a, b.T)
 
-        {% if ki == k_tiles - 1 %}
-        # Apply inverse scaling
-        offs_cm = offs_am + tl.arange(0, BLOCK_M)
-        offs_cn = offs_bn + tl.arange(0, BLOCK_N)
-        # Apply scaling
-        accumulator = apply_scaling(
-            accumulator,
-            a_scale,
-            b_scale,
-            SCALING_ROWWISE,
-            offs_cm,
-            offs_cn,
-            M,
-            N,
-            stride_a_scale_m,
-            stride_b_scale_n,
-        )
+        if ki == k_tiles - 1:
+            # Apply inverse scaling
+            offs_cm = offs_am + tl.arange(0, BLOCK_M)
+            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
+            # Apply scaling
+            accumulator = apply_scaling(
+                accumulator,
+                a_scale,
+                b_scale,
+                SCALING_ROWWISE,
+                offs_cm,
+                offs_cn,
+                M,
+                N,
+                stride_a_scale_m,
+                stride_b_scale_n,
+            )
 
-        idx_m = offs_cm[:, None]
-        idx_n = offs_cn[None, :]
-        mask = (idx_m < M) & (idx_n < N)
-        # inductor generates a suffix
-        {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
-        accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-        {% endif %}
+            idx_m = offs_cm[:, None]
+            idx_n = offs_cn[None, :]
+            mask = (idx_m < M) & (idx_n < N)
+            # inductor generates a suffix
+            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
+            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
 """