
Commit 622b0e8

Add a wrapper for hgemm kernel (#69)
Adds a common wrapper function to mma_ops.hpp for hgemm kernels that works for both CUDA and HIP. Replaces the HIP-specific `amdgcn_mfma_fp32_16x16x16fp16` entry point with `mma_sync_m16n16k16_row_col_f16f16f32`.
1 parent 4627b70 commit 622b0e8

3 files changed: +13, -28 lines changed

libflashinfer/include/gpu_iface/backend/hip/mma_hip.h

Lines changed: 1 addition & 2 deletions
@@ -112,11 +112,10 @@ load_fragment_transpose(uint32_t *R, const T *smem_ptr, uint32_t stride)
            static_cast<const uint32_t>(*v3);
 }
 
-// MMA operation for FP16 inputs with FP32 accumulator
 // MMA operation for FP16 inputs with FP32 accumulator
 template <typename T, mma::MMAMode mma_mode = mma::MMAMode::kInplaceUpdate>
 __device__ __forceinline__ void
-amdgcn_mfma_fp32_16x16x16fp16(float *C, uint32_t *A, uint32_t *B)
+mma_sync_m16n16k16_row_col_f16f16f32(float *C, uint32_t *A, uint32_t *B)
 {
     // Ensure T is either __half or __hip_bfloat16
     static_assert(std::is_same_v<T, __half> ||
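
For orientation, here is a minimal sketch of how the renamed HIP-side function can forward to the AMD MFMA builtin on CDNA hardware. Only the function name, template signature, and the static_assert shown in this diff come from the commit; the vector typedefs, the use of __builtin_amdgcn_mfma_f32_16x16x16f16, and the handling of the non-inplace accumulator mode are assumptions about the surrounding implementation, not code from this change.

#include <hip/hip_fp16.h>   // __half
#include <cstdint>
#include <type_traits>

// Clang/HIP extended vector types matching the MFMA per-lane operand layout:
// 4 packed f16 values for A and B, 4 f32 accumulator values for C.
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
typedef float float4_t __attribute__((ext_vector_type(4)));

// mma::MMAMode comes from the gpu_iface headers, as in the diff above.
template <typename T, mma::MMAMode mma_mode = mma::MMAMode::kInplaceUpdate>
__device__ __forceinline__ void
mma_sync_m16n16k16_row_col_f16f16f32(float *C, uint32_t *A, uint32_t *B)
{
    // The real header also accepts __hip_bfloat16; this sketch covers __half only.
    static_assert(std::is_same_v<T, __half>, "sketch limited to __half");

    // Reinterpret the packed 32-bit registers (two per operand) as 4-wide f16 vectors.
    half4_t a = *reinterpret_cast<const half4_t *>(A);
    half4_t b = *reinterpret_cast<const half4_t *>(B);

    float4_t acc = {0.f, 0.f, 0.f, 0.f};               // assumed init mode: start from zero
    if constexpr (mma_mode == mma::MMAMode::kInplaceUpdate) {
        acc = *reinterpret_cast<const float4_t *>(C);  // accumulate onto existing C
    }

    // One MFMA instruction computing D = A * B + C for a 16x16x16 tile.
    acc = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, acc, 0, 0, 0);
    *reinterpret_cast<float4_t *>(C) = acc;
}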

libflashinfer/include/gpu_iface/mma_ops.hpp

Lines changed: 11 additions & 25 deletions
@@ -9,10 +9,10 @@
 // Include platform-specific implementations
 #if defined(PLATFORM_CUDA_DEVICE)
 #include "backend/cuda/mma.cuh"
-namespace detail = flashinfer::gpu_iface::mma_impl::cuda;
+namespace mma_detail = flashinfer::gpu_iface::mma_impl::cuda;
 #elif defined(PLATFORM_HIP_DEVICE)
 #include "backend/hip/mma_hip.h"
-namespace detail = flashinfer::gpu_iface::mma_impl::hip;
+namespace mma_detail = flashinfer::gpu_iface::mma_impl::hip;
 #endif
 
 namespace flashinfer
@@ -34,14 +34,14 @@ namespace mma
 template <typename T>
 __device__ __forceinline__ void load_fragment(uint32_t *R, const T *smem_ptr)
 {
-    detail::load_fragment<T>(R, smem_ptr);
+    mma_detail::load_fragment<T>(R, smem_ptr);
 }
 
 template <typename T>
 __device__ __forceinline__ void
 load_fragment_transpose(uint32_t *R, const T *smem_ptr, uint32_t stride)
 {
-    detail::load_fragment_transpose<T>(R, smem_ptr, stride);
+    mma_detail::load_fragment_transpose<T>(R, smem_ptr, stride);
 }
 
 #if defined(PLATFORM_HIP_DEVICE) && defined(__gfx942__)
@@ -51,13 +51,14 @@ load_fragment_transpose_4x4_half_registers(uint32_t *R, const T *smem_ptr)
 {
     static_assert(std::is_same<T, int>::value,
                   "Only __half is supported for the 4x4 register transpose");
-    detail::load_fragment_4x4_half_registers<half>(R, smem_ptr);
+    mma_detail::load_fragment_4x4_half_registers<half>(R, smem_ptr);
 }
 #endif
 
 /*!
- * \brief Wrapper of two mma m16n16k16 instructions for row major and column
- * major f16 matrix multiplication, accumulated in f32.
+ * \brief An m16n16k16 gemm kernel using MMA instructions for CUDA/HIP for row
+ * major and column major f16 matrix multiplication, accumulated in f32.
+ *
  * \tparam T data type of the fragment
  * \tparam mma_mode whether we are initializing the accumulator or updating it
  * \param C pointer to the accumulator
@@ -66,32 +67,17 @@ load_fragment_transpose_4x4_half_registers(uint32_t *R, const T *smem_ptr)
  */
 template <typename T, MMAMode mma_mode = MMAMode::kInplaceUpdate>
 __device__ __forceinline__ void
-amdgcn_mfma_fp32_16x16x16fp16(float *C, uint32_t *A, uint32_t *B)
+mma_sync_m16n16k16_row_col_f16f16f32(float *C, uint32_t *A, uint32_t *B)
 {
-#if defined(PLATFORM_HIP_DEVICE)
-    detail::amdgcn_mfma_fp32_16x16x16fp16<T, mma_mode>(C, A, B);
-#else
-    FLASHINFER_RUNTIME_ASSERT(
-        "MMA f16f16f32 not supported on this architecture");
-#endif
+    mma_detail::mma_sync_m16n16k16_row_col_f16f16f32<T, mma_mode>(C, A, B);
 }
 
 template <typename DType>
 __device__ __forceinline__ void m16k16_rowsum_f16f16f32(float *d, DType *s)
 {
-    detail::m16k16_rowsum_f16f16f32<DType>(d, s);
+    mma_detail::m16k16_rowsum_f16f16f32<DType>(d, s);
 }
 
-// /*!
-// * \brief Use mma instructions to compute rowsum.
-// */
-// template <typename DType>
-// __device__ __forceinline__ void
-// m16k16_rowsum_f16f16f32(float* d, DType* s)
-// {
-//     detail::m16k16_rowsum_f16f16f32(d, s);
-// }
-
 } // namespace mma
 } // namespace gpu_iface
 } // namespace flashinfer
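
To illustrate how a caller uses the unified wrapper, here is a condensed device-side sketch mirroring the call pattern in the HIP test kernel below. The include path, helper name, and per-lane fragment sizes (two 32-bit registers per f16 operand, four accumulator floats) are assumptions for illustration, not part of this commit; the CUDA backend may use different per-lane register counts.

#include "gpu_iface/mma_ops.hpp"   // include path assumed from the repo layout
#include <cstdint>

using namespace flashinfer::gpu_iface;

// Multiply one 16x16 f16 tile of A (row major) by one 16x16 f16 tile of B
// (column major), accumulating into f32. Pointers are expected to be
// lane-adjusted by the caller, as in the test kernel below.
__device__ void mma_tile_16x16x16(const __half *a_smem, const __half *b_smem,
                                  float *c_out)
{
    uint32_t a_reg[2];                          // packed f16 fragment of A
    uint32_t b_reg[2];                          // packed f16 fragment of B
    float c_reg[4] = {0.f, 0.f, 0.f, 0.f};      // per-lane accumulator

    mma::load_fragment<__half>(a_reg, a_smem);
    mma::load_fragment_transpose<__half>(b_reg, b_smem, /*stride=*/16);

    // Backend-agnostic m16n16k16 f16 x f16 -> f32 MMA; dispatches to the CUDA
    // or HIP implementation selected at the top of mma_ops.hpp.
    mma::mma_sync_m16n16k16_row_col_f16f16f32<__half>(c_reg, a_reg, b_reg);

    // Each lane owns four accumulator values; output indexing depends on the
    // MMA fragment layout and is left to the caller in this sketch.
    for (int i = 0; i < 4; ++i) {
        c_out[i] = c_reg[i];
    }
}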

libflashinfer/tests/hip/test_mfma_fp32_16x16x16fp16.cpp

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ __global__ void test_mfma_kernel(const __half *A, const __half *B, float *C)
     flashinfer::gpu_iface::mma::load_fragment_transpose<__half>(b_reg,
                                                                 &B[b_idx], LDB);
 
-    flashinfer::gpu_iface::mma::amdgcn_mfma_fp32_16x16x16fp16<__half>(
+    flashinfer::gpu_iface::mma::mma_sync_m16n16k16_row_col_f16f16f32<__half>(
         c_reg, a_reg, b_reg);
 
     for (int i = 0; i < 4; ++i) {
