Fix ROCm global load builtin call in Marlin sparse kernel

petrex · petrex · commit 10c8823a7d4f · 2025-03-11T13:54:55.000-07:00
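Modify the three cp_async4 variants in sparse_marlin/mem.h (cp_async4_pred_zfill, cp_async4_pred, and cp_async4) so that, inside the USE_ROCM branch, the call to __builtin_amdgcn_global_load_lds is prefixed with extern. The change is intended to make the Marlin sparse kernel's global-to-shared memory loads build correctly on ROCm platforms while leaving the non-ROCm inline-assembly path unchanged, preserving cross-platform compatibility.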
diff --git a/torchao/csrc/cuda/sparse_marlin/mem.h b/torchao/csrc/cuda/sparse_marlin/mem.h
@@ -51,7 +51,7 @@ __device__ inline void cp_async4_pred_zfill(void* smem_ptr,
   int src_in_bytes = (zfill ? 0 : BYTES);
   uint32_t smem = cvta_to_shared(smem_ptr);
   #ifdef USE_ROCM
-  __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
+  extern __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
   #else
   asm volatile(
       "{\n"
@@ -68,7 +68,7 @@ __device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
   const int BYTES = 16;
   uint32_t smem = cvta_to_shared(smem_ptr);
   #ifdef USE_ROCM
-  __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
+  extern __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
   #else
   asm volatile(
       "{\n"
@@ -85,7 +85,7 @@ __device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
   const int BYTES = 16;
   uint32_t smem = cvta_to_shared(smem_ptr);
   #ifdef USE_ROCM
-  __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
+  extern __builtin_amdgcn_global_load_lds(static_cast<const uint32_t*>(glob_ptr), &smem, BYTES, 0, 0);
   #else
   asm volatile(
       "{\n"