Commit 01eaba2

Merge remote-tracking branch 'upstream/main' into merge-ci-files
2 parents: e08ff2d + 80665cd

File tree: 9 files changed, +625 −325 lines changed

examples/amd/example_amd_flash_attn_bwd.py

Lines changed: 525 additions & 285 deletions
Large diffs are not rendered by default.

examples/amd/example_amd_flash_attn_fwd.py

Lines changed: 4 additions & 16 deletions
@@ -34,7 +34,7 @@ def get_configs():
     block_N = [32, 64, 128, 256]
     threads = [128, 256, 512]
     num_split_q = [64, 128, 256]
-    num_stages = [0]
+    num_stages = [0, 1]
     enable_rasterization = [True]
     k_pack = [2]
     panel_size = [7, 8]
@@ -60,18 +60,6 @@ def get_configs():
                 "qk_coalesced_width": qkw,
                 "v_coalesced_width": vw,
             })
-    valid_configs.append({
-        'block_M': 64,
-        'block_N': 64,
-        'num_split_q': 64,
-        'threads': 256,
-        'num_stages': 1,
-        'enable_rasterization': True,
-        'k_pack': 2,
-        'panel_size': 64,
-        'qk_coalesced_width': 8,
-        'v_coalesced_width': 8,
-    })
     return valid_configs
 
 
@@ -95,7 +83,7 @@ def fast_flashattn(
     qk_coalesced_width: int,
     v_coalesced_width: int,
 ):
-    scale = (1.0 / dim)**0.5 * 1.44269504
+    scale = (1.0 / dim)**0.5
     head_kv = heads // groups
     q_shape = [batch, seq_len, heads, dim]
     kv_shape = [batch, seq_len, head_kv, dim]
@@ -185,15 +173,15 @@ def main(
             T.reduce_max(acc_s, m_i, dim=1, clear=False)
 
             for i in T.Parallel(block_M):
-                sf = T.exp2(m_prev[i] * scale - m_i[i] * scale)
+                sf = T.exp(m_prev[i] * scale - m_i[i] * scale)
                 l_i[i] *= sf
                 scale_factor[i] = sf
 
             for i, j in T.Parallel(block_M, dim):
                 acc_o[i, j] *= scale_factor[i]
 
             for i, j in T.Parallel(block_M, block_N):
-                acc_s[i, j] = T.exp2(acc_s[i, j] * scale - m_i[i] * scale)
+                acc_s[i, j] = T.exp(acc_s[i, j] * scale - m_i[i] * scale)
 
             T.reduce_sum(acc_s, row_sum, dim=1)
             for i in T.Parallel(block_M):
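
The constant dropped from scale (1.44269504) is log2(e): the kernel previously multiplied the logits by log2(e) and used T.exp2, which is mathematically equivalent to using T.exp with the plain 1/sqrt(dim) scale, as this hunk now does. A minimal NumPy check of that identity (illustrative only, not kernel code):

    import numpy as np

    LOG2_E = 1.44269504  # the factor removed from `scale` above; log2(e)

    x = np.random.randn(8)
    y_exp = np.exp(x - x.max())               # new path: T.exp with the plain scale
    y_exp2 = np.exp2((x - x.max()) * LOG2_E)  # old path: T.exp2 with scale * log2(e)
    assert np.allclose(y_exp, y_exp2)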

examples/amd/test.sh

Lines changed: 0 additions & 10 deletions
This file was deleted.

examples/flash_attention/example_mha_bwd.py

Lines changed: 0 additions & 7 deletions
@@ -38,14 +38,10 @@ def flash_fwd(
         scores_sum = T.alloc_fragment([block_M], accum_dtype)
         logsum = T.alloc_fragment([block_M], accum_dtype)
 
-        T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)})
         T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared)
         T.fill(acc_o, 0)
         T.fill(logsum, 0)
         T.fill(scores_max, -T.infinity(accum_dtype))
-        # T.copy(Q_shared, Q_local)
-        # for i, j in T.Parallel(block_M, dim):
-        #     Q_local[i, j] *= scale
         loop_range = (
             T.ceildiv(
                 (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N))
@@ -192,9 +188,6 @@ def flash_bwd(
 
         T.annotate_layout({
             dQ: make_dq_layout(dQ),
-            K_shared: tilelang.layout.make_swizzled_layout(K_shared),
-            dv_shared: tilelang.layout.make_swizzled_layout(dv_shared),
-            dk_shared: tilelang.layout.make_swizzled_layout(dk_shared),
         })
         T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared)
         T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared)
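
For context, T.annotate_layout paired with tilelang.layout.make_swizzled_layout is how these examples request a swizzled shared-memory layout (typically to avoid bank conflicts); this hunk removes those annotations from the MHA example. A minimal sketch of the pattern as it looked before the change, assuming tilelang's usual kernel DSL (T.Kernel, T.alloc_shared, T.Tensor); the kernel and buffer names here are illustrative, not from the repository:

    import tilelang
    import tilelang.language as T


    def copy_kernel(M, N, block_M=64, block_N=64, dtype="float16"):

        @T.prim_func
        def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
                A_shared = T.alloc_shared((block_M, block_N), dtype)
                # The kind of annotation deleted above: request a swizzled layout for a shared buffer.
                T.annotate_layout({A_shared: tilelang.layout.make_swizzled_layout(A_shared)})
                T.copy(A[by * block_M, bx * block_N], A_shared)
                T.copy(A_shared, B[by * block_M, bx * block_N])

        return main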

src/target/codegen_hip.cc

Lines changed: 22 additions & 6 deletions
@@ -41,10 +41,18 @@ static std::string GetFP8Type(DataType type) {
     stream << "fp8_e4" << vec << "_t";
   } else if (type.code() == DataType::kFloat8_e4m3fnuz) {
     stream << "fp8_e4" << vec << "_t";
+  } else if (type.code() == DataType::kFloat8_e4m3) {
+    stream << "fp8_e4" << vec << "_t";
+  } else if (type.code() == DataType::kFloat8_e4m3b11fnuz) {
+    stream << "fp8_e4" << vec << "_t";
   } else if (type.code() == DataType::kFloat8_e5m2) {
     stream << "fp8_e5" << vec << "_t";
+  } else if (type.code() == DataType::kFloat8_e5m2fnuz) {
+    stream << "fp8_e5" << vec << "_t";
+  } else if (type.code() == DataType::kFloat8_e8m0fnu) {
+    stream << "fp8_e8" << vec << "_t";
   } else {
-    LOG(FATAL) << "Unsupported FP8 type in HIP codegen";
+    LOG(FATAL) << "Unsupported FP8 type in HIP codegen: " << type;
   }
   return stream.str();
 }
@@ -926,10 +934,10 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
       {"float8_e4m3fnuzx8", "long"},
       {"float32x16", "float32x16"}};
   std::string call_mfma_code = R"({
-  *((({C_dtype}*){c_ref}) + {c_bias}) = {mfma_buildin}(*((({A_dtype}*){a_ref}) + {a_bias}),
-                *((({B_dtype}*){b_ref}) + {b_bias}),
-                *((({C_dtype}*){c_ref}) + {c_bias}), 0, 0, 0);
-})";
+    *((({C_dtype}*){c_ref}) + {c_bias}) = {mfma_buildin}(*((({A_dtype}*){a_ref}) + {a_bias}),
+                                                         *((({B_dtype}*){b_ref}) + {b_bias}),
+                                                         *((({C_dtype}*){c_ref}) + {c_bias}), 0, 0, 0);
+  })";
   std::string mfma_buildin = "__builtin_amdgcn_mfma_" + prefix;
   Replacer replacer;
 
@@ -955,6 +963,13 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
                      op->args, true, os);
   } else if (op->op.same_as(tl::tl_gemm_sp())) {
     LOG(FATAL) << "tl_gemm_sp is not supported on HIP";
+  } else if (op->op.same_as(tl::loop_break())) {
+    this->PrintIndent();
+    this->stream << "break;\n";
+  } else if (op->op.same_as(tl::no_set_max_nreg())) {
+    // HIP doesn't need explicit register management like CUDA
+    // This is a no-op for HIP
+    return;
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
@@ -1160,7 +1175,8 @@ inline void PrintConst(const FloatImmNode *op, std::ostream &os,
     os << "bfloat16_t";
     os << '(' << std::scientific << op->value << 'f' << ')';
     return;
-  } else if (op->dtype.is_float8_e4m3fnuz()) {
+  } else if (op->dtype.is_float8_e4m3fnuz() || op->dtype.is_float8_e4m3() ||
+             op->dtype.is_float8_e4m3fn()) {
     os << "fp8_e4_t";
     os << '(' << std::scientific << op->value << 'f' << ')';
     return;
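
Taken together, the GetFP8Type additions widen the dtype-to-HIP-typename mapping: the e4m3 variants lower to fp8_e4*_t, the e5m2 variants to fp8_e5*_t, e8m0fnu to fp8_e8*_t, and anything else now fails with the offending type included in the message. A rough Python sketch of just that mapping (illustrative; the real code also appends a vector-lane suffix, computed elsewhere in GetFP8Type and omitted here):

    # Mirrors the branch structure added to GetFP8Type above (scalar names only).
    FP8_TO_HIP_TYPE = {
        "float8_e4m3fnuz": "fp8_e4_t",
        "float8_e4m3": "fp8_e4_t",
        "float8_e4m3b11fnuz": "fp8_e4_t",
        "float8_e5m2": "fp8_e5_t",
        "float8_e5m2fnuz": "fp8_e5_t",
        "float8_e8m0fnu": "fp8_e8_t",
    }


    def fp8_hip_type(dtype: str) -> str:
        try:
            return FP8_TO_HIP_TYPE[dtype]
        except KeyError:
            # Matches the improved diagnostic: report which type was unsupported.
            raise ValueError(f"Unsupported FP8 type in HIP codegen: {dtype}")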

src/tl_templates/hip/common.h

Lines changed: 10 additions & 0 deletions
@@ -109,3 +109,13 @@ template <typename T1, typename T2>
 TL_DEVICE void AtomicAdd(T1 *address, T2 val) {
   atomicAdd(reinterpret_cast<T1 *>(address), static_cast<T1>(val));
 }
+
+// Overload for when the first argument is a value instead of a pointer
+template <typename T1, typename T2>
+TL_DEVICE void AtomicAdd(T1 address, T2 val) {
+  atomicAdd(reinterpret_cast<T1 *>(&address), static_cast<T1>(val));
+}
+
+template <typename T1, typename T2> TL_DEVICE T1 AtomicAddRet(T1 &ref, T2 val) {
+  return atomicAdd(&ref, static_cast<T1>(val));
+}

src/tl_templates/hip/gemm.h

Lines changed: 3 additions & 1 deletion
@@ -70,7 +70,9 @@ template <int M, int N, int K, int num_warp_m, int num_warp_n, bool TransposeA,
           typename B_type, typename C_type, typename AccDataType = float>
 class GemmTensorOp {
 public:
-  static_assert(!clear_accum, "clear_accum=true is not supported yet");
+  // Note: clear_accum=true is not fully supported in HIP implementation
+  // but we'll handle it by manually clearing the accumulator
+  // static_assert(!clear_accum, "clear_accum=true is not supported yet");
 
   static constexpr int micro_size_x = 16;
   static constexpr int micro_size_y = 16;

src/tl_templates/hip/hip_fp8.h

Lines changed: 55 additions & 0 deletions
@@ -5,6 +5,13 @@
 using fp8_e4_t = __hip_fp8_e4m3_fnuz;
 using fp8_e4_2_t = __hip_fp8x2_e4m3_fnuz;
 
+// Additional FP8 types for compatibility
+using fp8_e5_t = __hip_fp8_e5m2_fnuz;
+using fp8_e5_2_t = __hip_fp8x2_e5m2_fnuz;
+// Note: E8M0 types are not supported in current HIP version
+// using fp8_e8_t = __hip_fp8_e8m0_fnuz;
+// using fp8_e8_2_t = __hip_fp8x2_e8m0_fnuz;
+
 // Simple wrapper that provides member access for generated code
 struct fp8_e4_4_t {
   union {
@@ -43,6 +50,54 @@ struct __align__(16) fp8_e4_16_t {
   fp8_e4_8_t y;
 };
 
+// FP8 E5M2 vector types
+struct fp8_e5_4_t {
+  union {
+    __hip_fp8x4_e5m2_fnuz data;
+    struct {
+      fp8_e5_t x, y, z, w;
+    };
+  };
+  __device__ fp8_e5_4_t() = default;
+  __device__ fp8_e5_4_t(const __hip_fp8x4_e5m2_fnuz &val) : data(val) {}
+  __device__ operator __hip_fp8x4_e5m2_fnuz() const { return data; }
+};
+
+struct __align__(8) fp8_e5_8_t {
+  fp8_e5_4_t x;
+  fp8_e5_4_t y;
+};
+
+struct __align__(16) fp8_e5_16_t {
+  fp8_e5_8_t x;
+  fp8_e5_8_t y;
+};
+
+// FP8 E8M0 vector types - not supported in current HIP version
+/*
+struct fp8_e8_4_t {
+  union {
+    __hip_fp8x4_e8m0_fnuz data;
+    struct {
+      fp8_e8_t x, y, z, w;
+    };
+  };
+  __device__ fp8_e8_4_t() = default;
+  __device__ fp8_e8_4_t(const __hip_fp8x4_e8m0_fnuz &val) : data(val) {}
+  __device__ operator __hip_fp8x4_e8m0_fnuz() const { return data; }
+};
+
+struct __align__(8) fp8_e8_8_t {
+  fp8_e8_4_t x;
+  fp8_e8_4_t y;
+};
+
+struct __align__(16) fp8_e8_16_t {
+  fp8_e8_8_t x;
+  fp8_e8_8_t y;
+};
+*/
+
 __device__ fp8_e4_4_t make_fp8_e4_4_t(fp8_e4_t x, fp8_e4_t y, fp8_e4_t z,
                                       fp8_e4_t w) {
   // reinterpret the 4 fp8_e4_t values to signed char value and shift
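
The trailing context above is the existing make_fp8_e4_4_t helper, whose comment describes reinterpreting four fp8 values as bytes and shifting them into a single packed word. A small Python illustration of that packing idea (the byte order shown is an assumption; the device code above is authoritative):

    def pack_fp8x4(x: int, y: int, z: int, w: int) -> int:
        # Pack four 8-bit fp8 bit patterns into one 32-bit word by shifting.
        return (x & 0xFF) | ((y & 0xFF) << 8) | ((z & 0xFF) << 16) | ((w & 0xFF) << 24)

    assert pack_fp8x4(0x01, 0x02, 0x03, 0x04) == 0x04030201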

testing/python/amd/test_tilelang_gemm_mfma_intrinsic.py

Lines changed: 6 additions & 0 deletions
@@ -238,6 +238,12 @@ def test_assert_tl_matmul():
         128, 256, 256, "int8", "int32", b_transposed=False, accum_dtype="int32")
     assert_tl_matmul_correctness(
         128, 256, 256, "int8", "int32", b_transposed=False, accum_dtype="int32", k_pack=2)
+    assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3fnuz", "float16")
+    assert_tl_matmul_correctness(128, 256, 256, "float8_e4m3fnuz", "float32")
+    assert_tl_matmul_correctness(128, 256, 256, "float8_e4m3fnuz", "float32", k_pack=2)
+    assert_tl_matmul_correctness(128, 256, 256, "float8_e4m3fnuz", "float32", b_transposed=False)
+    assert_tl_matmul_correctness(
+        128, 256, 256, "float8_e4m3fnuz", "float32", b_transposed=False, k_pack=2)
 
 
 if __name__ == "__main__":
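
The new cases exercise float8_e4m3fnuz inputs with float16 and float32 outputs, plus k_pack and b_transposed variations. For a host-side sense of what such a check compares against, a rough float32-reference sketch, assuming the ml_dtypes package for the e4m3fnuz NumPy dtype (the real helper, assert_tl_matmul_correctness, is defined in this test file and is not reproduced here):

    import numpy as np
    import ml_dtypes  # assumption: provides the float8_e4m3fnuz NumPy dtype

    M, N, K = 128, 256, 256
    # Quantize random inputs through e4m3fnuz, roughly as the fp8 kernel consumes them.
    a = np.random.randn(M, K).astype(ml_dtypes.float8_e4m3fnuz)
    b = np.random.randn(K, N).astype(ml_dtypes.float8_e4m3fnuz)

    # The reference upcasts and accumulates in float32.
    ref = a.astype(np.float32) @ b.astype(np.float32)
    print(ref.shape, ref.dtype)  # (128, 256) float32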
