Enable bf16 accumulation

sommerlukas · sommerlukas · commit d76676db7f73 · 2025-05-05T15:46:14.000+01:00
Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
diff --git a/torch/_inductor/codegen/xpu/cutlass_utils.py b/torch/_inductor/codegen/xpu/cutlass_utils.py
@@ -203,7 +203,7 @@ def get_accumulator_dtype(
         return None
 
     if all(dtype == torch.bfloat16 for dtype in input_torch_dtypes):
-        return torch.float
+        return torch.bfloat16
     else:
         raise NotImplementedError(f"Unsupported data types: {input_torch_dtypes}")
 
diff --git a/torch/_inductor/codegen/xpu/gemm_template.py b/torch/_inductor/codegen/xpu/gemm_template.py
@@ -652,11 +652,7 @@ def __init__(
         beta: float,
         input_reorder: Optional[list[int]] = None,
     ):
-        # TODO (SYCL) : This is a workaround hardcoding output type (layout) to float32
-        # Should be removed once not limited to the bfloat input->float32 accum cutlass configurations
-        float_layout = copy.deepcopy(layout)
-        float_layout.dtype = float32
-        super().__init__(input_nodes, float_layout, alpha, beta, input_reorder)
+        super().__init__(input_nodes, layout, alpha, beta, input_reorder)
 
     @staticmethod
     def add_cutlass_gemm_choices(
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
@@ -2715,10 +2715,7 @@ def replace_operation_buffer(
                 out_buffer = out_storage.data
                 assert isinstance(out_buffer, ir.OperationBuffer)
 
-                # TODO (SYCL): This is a temporary hack to allow auto-tuning
-                # while our CUTLASS does not support bf16 accumulation for
-                # GEMM. Uncomment this line when it is supported.
-                #out_buffer.layout = multi_node.layout
+                out_buffer.layout = multi_node.layout
                 replace_operation_buffer(multi_node, out_buffer)
                 new_scheduler_node = self.create_scheduler_node(out_buffer)