Commit bc53ae6

Enable autotuning and bf16 accumulation for SYCL CUTLASS (#4)
Enable autotuning for SYCL CUTLASS by completing the SYCL benchmark request class. Also remove a temporary workaround that forced float32 accumulation, so that GEMM can now accumulate in bfloat16. This addresses one of the items left open in #2.

---------

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
1 parent 02d05b4 commit bc53ae6
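
For context, here is a minimal sketch of how the path enabled by this commit could be exercised from user code. It assumes this branch routes max-autotune GEMM lowering for "xpu" tensors through the SYCL CUTLASS backend; the config options shown are standard Inductor options, but whether they cover SYCL CUTLASS here is an assumption, not something stated in the commit.

# Sketch only: exercise CUTLASS GEMM autotuning on an XPU device with bf16 inputs.
import torch
import torch._inductor.config as inductor_config

inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "CUTLASS"  # assumed to select SYCL CUTLASS on this branch

a = torch.randn(1024, 1024, device="xpu", dtype=torch.bfloat16)
b = torch.randn(1024, 1024, device="xpu", dtype=torch.bfloat16)

compiled_mm = torch.compile(torch.mm, mode="max-autotune")
# With this commit, candidate CUTLASS kernels are actually benchmarked during
# autotuning (instead of returning a placeholder time) and may accumulate in bfloat16.
out = compiled_mm(a, b)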

File tree

5 files changed (+22 −11 lines)
third_party/cutlass (submodule pointer update; diff not shown)

torch/_inductor/autotune_process.py

Lines changed: 1 addition & 2 deletions
@@ -889,8 +889,7 @@ def get_tuning_process_pool() -> TuningProcessPool:
 
 class SYCLBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
     # Important: Instances of this class have to be serializable
-    # across process boundaries. Do not put Tensors in here!
-    # TODO (SYCL) : Complete the bmrq class to enable full autotuning
+    # across process boundaries. Do not put device tensors in here!
     def __init__(
         self,
         kernel_name: str,

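The "serializable across process boundaries" comment above matters because Inductor can benchmark autotuning candidates in a dedicated tuning subprocess (the TuningProcessPool referenced in the hunk header). A rough sketch of that constraint; autotune_in_subprocess is an existing Inductor config option, but how this branch combines it with the SYCL backend is an assumption:

# Sketch only: the benchmark request is pickled and sent to a tuning subprocess,
# so it should carry tensor metadata (shape, dtype, device) rather than live device tensors.
import torch._inductor.config as inductor_config

inductor_config.autotune_in_subprocess = True  # benchmark candidates out-of-process
# Each SYCLBenchmarkRequest is then serialized across the process boundary and the
# subprocess recreates the input/output tensors from the recorded metadata before timing.
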
torch/_inductor/codegen/xpu/cutlass_utils.py

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ def get_accumulator_dtype(
         return None
 
     if all(dtype == torch.bfloat16 for dtype in input_torch_dtypes):
-        return torch.float
+        return torch.bfloat16
     else:
         raise NotImplementedError(f"Unsupported data types: {input_torch_dtypes}")
 

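A small usage sketch of the change above. It assumes the helper's signature mirrors the CUDA counterpart and takes just the list of input dtypes:

# Sketch only: illustrates the new accumulator dtype selection.
import torch
from torch._inductor.codegen.xpu import cutlass_utils

acc = cutlass_utils.get_accumulator_dtype([torch.bfloat16, torch.bfloat16])
assert acc == torch.bfloat16  # before this commit, this returned torch.float
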
torch/_inductor/codegen/xpu/gemm_template.py

Lines changed: 17 additions & 5 deletions
@@ -652,11 +652,7 @@ def __init__(
         beta: float,
         input_reorder: Optional[list[int]] = None,
     ):
-        # TODO (SYCL) : This is a workaround hardcoding output type (layout) to float32
-        # Should be removed once not limited to the bfloat input->float32 accum cutlass configurations
-        float_layout = copy.deepcopy(layout)
-        float_layout.dtype = float32
-        super().__init__(input_nodes, float_layout, alpha, beta, input_reorder)
+        super().__init__(input_nodes, layout, alpha, beta, input_reorder)
 
     @staticmethod
     def add_cutlass_gemm_choices(

@@ -780,14 +776,30 @@ def _set_bias_layout_and_alignment(
         self,
         op: "cutlass_library.gemm_op.GemmOperation", # type: ignore[name-defined] # noqa: F821
     ) -> bool:
+        import cutlass_library.library as cutlass_lib
+
         has_bias = len(self.input_nodes) >= 3 and self.input_nodes[2] is not None
         if has_bias:
             bias = self.input_nodes[2]
+            # Bias data type
+            op.C.element = cutlass_utils.torch_dtype_to_cutlass_type(
+                bias.get_layout().dtype
+            )
+            assert op.C.element == op.D.element, (
+                f"Expect C and D to have the same dtype, found {op.C.element} and {op.D.element}"
+            )
+
+            # Bias layout
             bias_layout = CUTLASSGemmTemplate.cutlass_layout(bias.get_layout())
             op.C.layout = bias_layout
+
+            # Bias alignment
             status = self.set_alignment(bias.get_layout(), op.C)
             if not status:
                 return False
+
+        else:
+            op.C.element = cutlass_lib.DataType.void
         return True
 
     def _dtype_match(

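The second hunk above mirrors the bias dtype and layout onto the CUTLASS C operand and, when no bias input is present, marks C as void so the epilogue has no source tensor to read. A condensed sketch of that decision, written as a free function purely for illustration (configure_c_operand is not an API in this codebase):

# Sketch only: condensed form of the bias handling added above.
import cutlass_library.library as cutlass_lib

def configure_c_operand(op, bias_dtype, to_cutlass_type):
    # Mirror the bias dtype onto op.C, or disable the C source entirely.
    if bias_dtype is not None:
        op.C.element = to_cutlass_type(bias_dtype)
        # The template additionally copies the bias layout and alignment onto op.C.
    else:
        op.C.element = cutlass_lib.DataType.void
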
torch/_inductor/codegen/xpu/sycl_kernel.py

Lines changed: 2 additions & 2 deletions
@@ -467,8 +467,8 @@ def precompile(self) -> None:
         self.bmreq.precompile()
 
     def benchmark(self, *args, out) -> float:
-        # TODO (SYCL) : Enable benchmarking once supported
-        return 0.001
+        assert self.bmreq is not None
+        return self.bmreq.benchmark(*args, output_tensor=out)
 
     def __str__(self) -> str:
         return f"SYCLTemplateCaller(source_file={self.bmreq.source_file})"

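With the hard-coded 0.001 placeholder gone, the caller defers to the benchmark request, which times the candidate kernel on the device. A rough conceptual sketch of what such a timing loop amounts to; the actual SYCLBenchmarkRequest implementation may use a different mechanism (for example device events):

# Sketch only: conceptual equivalent of benchmarking one candidate kernel on XPU.
import time
import torch

def time_kernel_ms(run_kernel, *args, warmup=3, iters=10):
    for _ in range(warmup):
        run_kernel(*args)
    torch.xpu.synchronize()  # make sure warm-up launches have finished
    start = time.perf_counter()
    for _ in range(iters):
        run_kernel(*args)
    torch.xpu.synchronize()  # wait for all timed launches to complete
    return (time.perf_counter() - start) * 1e3 / iters  # mean time per run, in ms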