codeplaysoftware · sommerlukas · May 5, 2025 · Apr 30, 2025
diff --git a/torch/_inductor/codegen/xpu/gemm_template.py b/torch/_inductor/codegen/xpu/gemm_template.py
@@ -42,9 +42,7 @@
   using coord_t = cutlass::gemm::GemmCoord::Index;
   static cutlass::KernelHardwareInfo hw_info;
 
-  // TODO (SYCL) : device_id here is only used for hw info and doesn't necessarly mean
-  // it's linked to the SYCL queue. It's hardcoded to 0 in the CUDA version as well.
-  const int device_id = 0;
+  const int device_id = syclcompat::get_device_id(stream->get_device());
 
   if (hw_info.sm_count == 0) {
     hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(device_id);
@@ -75,15 +73,11 @@
 #endif
 #endif
   {
-    // TODO (SYCL): Pass the SYCL queue (currently last arg of `kernel_call_signature` above)
-    // once supported on CUTLASS side. Variable name to respect the naming in: _EXTRA_CPP_ARGS (sycl_kernel.py)
-    auto status = gemm_op.initialize(arguments, workspace);
+    auto status = gemm_op.initialize(arguments, workspace, stream);
     CUTLASS_CHECK(status);
   }
   {
-    // TODO (SYCL): Pass the SYCL queue once supported on CUTLASS side.
-    // Variable name to respect the naming in: _EXTRA_CPP_ARGS (sycl_kernel.py)
-    auto status = gemm_op.run();
+    auto status = gemm_op.run(stream);
     CUTLASS_CHECK(status);
-    syclcompat::wait_and_throw();
+    syclcompat::wait_and_throw(*stream);  // sync the queue the kernel was launched on, not syclcompat's default queue
   }

diff --git a/torch/_inductor/codegen/xpu/sycl_kernel.py b/torch/_inductor/codegen/xpu/sycl_kernel.py
@@ -154,8 +154,7 @@ class SYCLTemplateKernel(SYCLKernel):
     Template kernels defined by SYCL / Cutlass in C++.
     """
 
-    # TODO (SYCL): The SYCL queue is not being used
-    _EXTRA_CPP_ARGS = "size_t* workspace_size, uint8_t* workspace, sycl::queue stream"
+    _EXTRA_CPP_ARGS = "size_t* workspace_size, uint8_t* workspace, sycl::queue* stream"
 
     def __init__(
         self,