Commit e21c49d

Enable autotuning for SYCL CUTLASS

Enable autotuning for SYCL CUTLASS by completing the SYCL benchmark request class. Also adds a temporary workaround to allow bf16 GEMM to accumulate in FP32 in code paths used when auto-tuning is active.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>

1 parent e28cd77 · commit e21c49d
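
For context, these templates are exercised through Inductor's max-autotune path. A minimal sketch of a workload that would now route candidate SYCL CUTLASS kernels through real benchmarking; the device string, shapes, and compile mode here are illustrative assumptions, not part of this commit:

import torch

# Hypothetical repro: a bf16 GEMM compiled with max-autotune on an XPU
# device; with this commit, candidate kernels are timed via
# SYCLBenchmarkRequest instead of a stub. Shapes and flags are assumptions.
a = torch.randn(1024, 1024, device="xpu", dtype=torch.bfloat16)
b = torch.randn(1024, 1024, device="xpu", dtype=torch.bfloat16)

@torch.compile(mode="max-autotune")
def gemm(x, y):
    return x @ y

out = gemm(a, b)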

File tree

3 files changed (+7, −5):

torch/_inductor/autotune_process.py
torch/_inductor/codegen/xpu/sycl_kernel.py
torch/_inductor/scheduler.py

torch/_inductor/autotune_process.py

Lines changed: 1 addition & 2 deletions

@@ -889,8 +889,7 @@ def get_tuning_process_pool() -> TuningProcessPool:
 
 class SYCLBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest):
     # Important: Instances of this class have to be serializable
-    # across process boundaries. Do not put Tensors in here!
-    # TODO (SYCL) : Complete the bmrq class to enable full autotuning
+    # across process boundaries. Do not put device tensors in here!
     def __init__(
         self,
         kernel_name: str,
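
The retained comment matters because benchmark requests are pickled and shipped to tuning subprocesses (see get_tuning_process_pool above). A generic sketch of that constraint, with all names hypothetical rather than Inductor's actual classes:

import pickle
from dataclasses import dataclass
from typing import List, Tuple

# Generic illustration of the serializability constraint: the request
# carries only picklable metadata (name, shapes, dtype), never live
# device tensors, so it can cross the process boundary to a worker.
@dataclass
class ToyBenchmarkRequest:
    kernel_name: str
    input_shapes: List[Tuple[int, int]]
    dtype: str

    def benchmark(self) -> float:
        # The worker would materialize tensors from the metadata above
        # and time the kernel; elided in this sketch.
        raise NotImplementedError

req = ToyBenchmarkRequest("gemm_bf16", [(1024, 1024), (1024, 1024)], "bfloat16")
payload = pickle.dumps(req)  # succeeds because no device state is held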

torch/_inductor/codegen/xpu/sycl_kernel.py

Lines changed: 2 additions & 2 deletions

@@ -468,8 +468,8 @@ def precompile(self) -> None:
         self.bmreq.precompile()
 
     def benchmark(self, *args, out) -> float:
-        # TODO (SYCL) : Enable benchmarking once supported
-        return 0.001
+        assert self.bmreq is not None
+        return self.bmreq.benchmark(*args, output_tensor=out)
 
     def __str__(self) -> str:
         return f"SYCLTemplateCaller(source_file={self.bmreq.source_file})"
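
Before this change the stub reported a constant 0.001 for every candidate, so the tuner could not rank kernels; delegating to self.bmreq.benchmark produces real timings. A toy illustration with hypothetical candidate names and numbers:

# Hypothetical timings: a constant stub makes the argmin arbitrary,
# while real measurements let the tuner pick an actual winner.
stub_times = {"cfg_a": 0.001, "cfg_b": 0.001, "cfg_c": 0.001}
real_times = {"cfg_a": 0.42, "cfg_b": 0.31, "cfg_c": 0.55}

print(min(stub_times, key=stub_times.get))  # tie: arbitrary choice
print(min(real_times, key=real_times.get))  # cfg_b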

torch/_inductor/scheduler.py

Lines changed: 4 additions & 1 deletion

@@ -2715,7 +2715,10 @@ def replace_operation_buffer(
             out_buffer = out_storage.data
             assert isinstance(out_buffer, ir.OperationBuffer)
 
-            out_buffer.layout = multi_node.layout
+            # TODO (SYCL): This is a temporary hack to allow auto-tuning
+            # while our CUTLASS does not support bf16 accumulation for
+            # GEMM. Uncomment this line when it is supported.
+            #out_buffer.layout = multi_node.layout
             replace_operation_buffer(multi_node, out_buffer)
             new_scheduler_node = self.create_scheduler_node(out_buffer)
 
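For context on why the layout override is disabled: a bf16 GEMM whose CUTLASS kernel accumulates and writes FP32 produces a buffer whose dtype no longer matches the bf16 layout recorded on the multi-template node. A rough, standalone illustration of that dtype mismatch; shapes and dtypes are assumptions, not Inductor code:

import torch

# The kernel effectively computes in FP32 (left), while the template
# node's layout expects bf16 (right); forcing the bf16 layout onto the
# FP32 buffer would misdescribe its contents.
a = torch.randn(64, 64, dtype=torch.bfloat16)
b = torch.randn(64, 64, dtype=torch.bfloat16)

acc_fp32 = a.float() @ b.float()   # FP32-accumulated result
print(acc_fp32.dtype)              # torch.float32
print(torch.bfloat16)              # dtype the recorded layout expects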