Performant backward Triton implementation with separated dkdv and dq kernels #122

Merged · 34 commits · Feb 4, 2025

Changes from 1 commit (of 34 commits)
f8c1ee5  added the split file (jtang10, Dec 17, 2024)
c1b5fae  overhauled split file, need to add new kernels (jtang10, Dec 17, 2024)
a3f622f  copied triton fa over for reference (jtang10, Dec 18, 2024)
90d639e  added comments (jtang10, Dec 18, 2024)
0560963  preprocess and dkdv done (jtang10, Dec 27, 2024)
9328e5a  fixed dkdv, added dq (jtang10, Dec 27, 2024)
682eb1f  fixed assumption on q, kv length different, run but incorrect (jtang10, Jan 6, 2025)
d0afca7  added standalone test for split bwd kernel (jtang10, Jan 7, 2025)
1c542d2  minor change on the ptr arith (jtang10, Jan 7, 2025)
8b1629f  separated the dkdv and dq kernels (jtang10, Jan 8, 2025)
2e4c812  GQA works now, onto seqlen q != k (jtang10, Jan 9, 2025)
be65b39  dk,dq working, dv still failing (jtang10, Jan 11, 2025)
5e5ad91  fixed the masking and num_step calc, now q==k works (jtang10, Jan 21, 2025)
2e67d95  added debug print with interpreter, might not work entirely w/o next … (jtang10, Jan 28, 2025)
149dd4b  fixed all issues with q != k (jtang10, Jan 28, 2025)
c16a4f7  fixed varlen issue (jtang10, Jan 28, 2025)
58cc0f2  fixup on debug print (jtang10, Jan 28, 2025)
88054a7  fixed dropout, esp w/ varlen (jtang10, Jan 29, 2025)
059f665  added USE_EXP2 toggle (jtang10, Jan 29, 2025)
ac18466  added noncausal kernel (jtang10, Jan 29, 2025)
9fa9688  updated internal test for noncausal and use_exp2 (jtang10, Jan 29, 2025)
10e5468  formatting (jtang10, Jan 29, 2025)
91b30ac  fixed dropout from seed bug (jtang10, Jan 30, 2025)
00d2c77  added envvar USE_SPLIT to toggle btw bwd kernels (jtang10, Jan 30, 2025)
58941ed  fixed the qkv pack issue and removed hack (jtang10, Jan 31, 2025)
7fadfe3  added the split kernel into interface_fa.py (jtang10, Jan 31, 2025)
c97f298  change USE_SPLIT to USE_SINGLE_BWD_KERNEL to make split default (jtang10, Jan 31, 2025)
c6f5607  removed redundant file (jtang10, Jan 31, 2025)
383d8c7  fixed missing import in test (jtang10, Feb 3, 2025)
8e99137  fixed import in interface_fa.py (jtang10, Feb 3, 2025)
8436dc7  revert changes in flash_attn_interface.py (jtang10, Feb 3, 2025)
ada4bb8  updated strides to adapt to various tensor init shape (jtang10, Feb 3, 2025)
bacc596  fixed issue that dqkv not zero'd (jtang10, Feb 4, 2025)
0055b35  disabled the AMD local test (jtang10, Feb 4, 2025)
minor change on the ptr arith
jtang10 committed Feb 3, 2025
commit 1c542d2a141a6c7a7e5164f89fd8e7f3b60ce91e
62 changes: 32 additions & 30 deletions flash_attn/flash_attn_triton_amd/bwd_prefill_split.py
@@ -125,7 +125,7 @@ def _attn_bwd_dkdv(

# Load m before computing qk to reduce pipeline stall.
offs_m = curr_m + tl.arange(0, BLOCK_M1)
m = tl.load(M + offs_m, mask=offs_m < seqlen_q, other=0.0)
m = tl.load(M + offs_m, mask=offs_m < seqlen_q)
qkT = tl.dot(k, qT)
pT = tl.math.exp2(qkT - m[None, :])
# Autoregressive masking.
@@ -141,7 +141,7 @@ def _attn_bwd_dkdv(
ppT = ppT.to(tl.float16)
dv += tl.dot(ppT, do)
# D (= delta) is pre-divided by ds_scale.
Di = tl.load(D + offs_m, mask=offs_m < seqlen_q, other=0.0)
Di = tl.load(D + offs_m, mask=offs_m < seqlen_q)
# Compute dP and dS.
dpT = tl.dot(v, tl.trans(do)).to(tl.float32)
if DROPOUT:
@@ -153,8 +153,9 @@
curr_m += step_m
qT_ptrs += step_m * stride_qm
do_ptrs += step_m * stride_qm
curr_dropout_offset += step_m * stride_qm
curr_philox_offset += step_m * stride_qm
if DROPOUT:
curr_dropout_offset += step_m * stride_qm
curr_philox_offset += step_m * stride_qm
return dk, dv


@@ -241,7 +242,7 @@ def _attn_bwd_dq(dq, # output
# num_pid = max(
# tl.cdiv(max_seqlen_k // BLOCK_N1),
# tl.cdiv(max_seqlen_q // BLOCK_M2))
# grid = (num_pid, 1, batch * nheads_q)
# grid = (num_pid, batch * nheads_q)
@triton.jit
def _bwd_kernel(
Q, K, V, sm_scale, Out, DO, DQ, DK, DV,
@@ -270,7 +271,7 @@ def _bwd_kernel(
LN2: tl.constexpr = 0.6931471824645996 # = ln(2)
# program ids
pid = tl.program_id(0)
bhqid = tl.program_id(2)
bhqid = tl.program_id(1)
bid = bhqid // HQ
hqid = bhqid % HQ

@@ -302,21 +303,24 @@
# If MQA / GQA, set the K and V head offsets appropriately.
GROUP_SIZE = HQ // HK
if GROUP_SIZE != 1:
off_hk = hqid // GROUP_SIZE
hkid = hqid // GROUP_SIZE
else:
off_hk = hqid
hkid = hqid
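# e.g. with HQ = 8 query heads and HK = 2 kv heads, GROUP_SIZE = 4, so
# query heads 0-3 read kv head 0 and query heads 4-7 read kv head 1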

# input tensor offsets
Q += bid * stride_qb + hqid * stride_qh + q_start * stride_qm
K += bid * stride_kb + off_hk * stride_kh + k_start * stride_kn
V += bid * stride_vb + off_hk * stride_vh + k_start * stride_vn
DO += bid * stride_qb + hqid * stride_qh + q_start * stride_qm
M += bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam
Delta += bid * stride_deltab + hqid * stride_deltah + q_start * stride_deltam
adj_q = bid * stride_qb + hqid * stride_qh + q_start * stride_qm
adj_kv = bid * stride_kb + hkid * stride_kh + k_start * stride_kn
adj_delta = bhqid * stride_deltab + q_start * stride_deltam
Q += adj_q
K += adj_kv
V += adj_kv
DO += adj_q
M += adj_delta
Delta += adj_delta
# output tensor offsets
DQ += bid * stride_qb + hqid * stride_qh + q_start * stride_qm
DK += bid * stride_kb + off_hk * stride_kh + k_start * stride_kn
DV += bid * stride_vb + off_hk * stride_vh + k_start * stride_vn
DQ += adj_q
DK += adj_kv
DV += adj_kv

# dropout is a boolean mask that will clear out the multiplicand tensor
# wherever the dropout's entry is 0. It is generated by the tl.rand(seed,
@@ -359,10 +363,9 @@ def _bwd_kernel(
mask_k = offs_k < ACTUAL_HEAD_DIM
mask_kv &= mask_k[None, :]
# load K and V: they stay in SRAM throughout the inner loop.
k_ptrs = K + offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk
v_ptrs = V + offs_n[:, None] * stride_vn + offs_k[None, :] * stride_vk
k = tl.load(k_ptrs, mask=mask_kv, other=0.0)
v = tl.load(v_ptrs, mask=mask_kv, other=0.0)
offs_kv = offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk
k = tl.load(K + offs_kv, mask=mask_kv, other=0.0)
v = tl.load(V + offs_kv, mask=mask_kv, other=0.0)

MASK_BLOCK_M1: tl.constexpr = BLOCK_M1 // BLK_SLICE_FACTOR
num_steps = BLOCK_N1 // MASK_BLOCK_M1
@@ -390,7 +393,7 @@ def _bwd_kernel(
else:
start_m = 0

num_steps = tl.cdiv(seqlen_q - start_m, BLOCK_M1)
num_steps = (seqlen_q - start_m) // BLOCK_M1
# only the blocks on the causal mask diagonal need masking
dk, dv = _attn_bwd_dkdv(
dk, dv, # output tensors
@@ -407,11 +410,9 @@
)

# Write back dV and dK.
dv_ptrs = DV + offs_n[:, None] * stride_vn + offs_k[None, :] * stride_vk
tl.store(dv_ptrs, dv, mask=mask_kv)
tl.store(DV + offs_kv, dv, mask=mask_kv)
dk *= sm_scale
dk_ptrs = DK + offs_n[:, None] * stride_vn + offs_k[None, :] * stride_vk
tl.store(dk_ptrs, dk, mask=mask_kv)
tl.store(DK + offs_kv, dk, mask=mask_kv)

# THIS BLOCK DOES DQ:
if pid < num_qblocks:
@@ -420,8 +421,9 @@
# TODO: now pid is only a function of max_seqlen_k, so it's incorrect for the
start_m = pid * BLOCK_M2
# seqlen_q > seqlen_k, no need to process these tiles for dq
if start_m + BLOCK_M2 < seqlen_delta:
return
# TODO: fix this
# if start_m + BLOCK_M2 < seqlen_delta:
# return
end_n = start_m + BLOCK_M2
# when seqlen_q < seqlen_k, the end_n is padded
end_n += seqlen_delta
@@ -439,7 +441,7 @@
dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
do = tl.load(do_ptrs, mask=mask_q, other=0.0)

m = tl.load(M + offs_m, mask=offs_m < seqlen_q, other=0.0)
m = tl.load(M + offs_m, mask=offs_m < seqlen_q)
m = m[:, None]

# Compute dQ for masked (diagonal) blocks.
@@ -584,7 +586,7 @@ def attention_prefill_backward_triton_split_impl(
num_pid = max(
(max_seqlen_k + BLOCK_N1 - 1) // BLOCK_N1,
(max_seqlen_q + BLOCK_M2 - 1) // BLOCK_M2)
grid = (num_pid, 1, batch * nheads_q)
grid = (num_pid, batch * nheads_q)
_bwd_kernel[grid](
q, k, v, sm_scale, o, do, dq, dk, dv,
softmax_lse, delta,
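
For reference, a minimal host-side sketch of the launch geometry used above (the helper name and the BLOCK_M2 / BLOCK_N1 values are placeholders for illustration, not the kernel's tuned defaults):

    def split_bwd_grid(batch, nheads_q, max_seqlen_q, max_seqlen_k,
                       BLOCK_M2=64, BLOCK_N1=64):
        # a single pid axis is sized to cover both the dk/dv tiles
        # (BLOCK_N1 steps over seqlen_k) and the dq tiles (BLOCK_M2 steps
        # over seqlen_q); inside the kernel, pid < num_qblocks gates the dq work
        num_kblocks = (max_seqlen_k + BLOCK_N1 - 1) // BLOCK_N1
        num_qblocks = (max_seqlen_q + BLOCK_M2 - 1) // BLOCK_M2
        num_pid = max(num_kblocks, num_qblocks)
        # the second grid axis enumerates (batch, query head) pairs
        return (num_pid, batch * nheads_q)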