
Commit fcfbdde

revise flash decoding triton kernel in/out shapes
1 parent 8682115 commit fcfbdde

File tree

4 files changed: +70 -78 lines changed


colossalai/kernel/triton/flash_decoding.py

Lines changed: 10 additions & 10 deletions
@@ -9,7 +9,7 @@
 # Triton 2.1.0
 @triton.jit
 def _flash_decoding_fwd_kernel(
-    Q,  # [batch_size, head_num, head_dim]
+    Q,  # [batch_size, head_num, q_len(1), head_dim]
     KCache,  # [num_blocks, num_kv_heads, head_dim, block_size]
     VCache,  # [num_blocks, num_kv_heads, head_dim, block_size]
     block_tables,  # [batch_size, max_blocks_per_sequence]
@@ -18,6 +18,7 @@ def _flash_decoding_fwd_kernel(
     kv_seq_len,  # [batch_size]
     stride_qt,
     stride_qh,
+    stride_ql,
     stride_qd,
     stride_cacheb,
     stride_cacheh,
@@ -140,6 +141,7 @@ def _flash_decoding_fwd_reduce_kernel(
     stride_o_lseh,
     stride_o_lseb,
     stride_ob,
+    stride_ol,
     stride_oh,
     stride_od,
     BLOCK_KV: tl.constexpr,
@@ -197,7 +199,7 @@ def flash_decoding_attention(
     Flash decoding implemented with a blocked KV Cache (PagedAttention) during decoding stage.

     Args:
-        q (torch.Tensor): [bsz, 1, num_heads, head_dim]
+        q (torch.Tensor): [bsz, num_heads, q_len(1), head_dim]
         k_cache (torch.Tensor): [num_blocks, num_kv_heads, head_dim, block_size]
         v_cache (torch.Tensor): [num_blocks, num_kv_heads, head_dim, block_size]
         kv_seq_len (torch.Tensor): [batch_size]
@@ -211,9 +213,9 @@ def flash_decoding_attention(
         num_kv_group (int, optional): Number of key/value groups. Defaults to 1.

     Returns:
-        Output tensor with shape [bsz, num_heads, head_dim]
+        Output tensor with shape [bsz, num_heads, q_len, head_dim]
     """
-    bsz, _, num_heads, head_dim = q.shape
+    bsz, num_heads, _, head_dim = q.shape

     assert head_dim in {32, 64, 128, 256}
     assert kv_seq_len.shape[0] == block_tables.shape[0] == bsz, (
@@ -234,10 +236,6 @@ def flash_decoding_attention(
     assert block_size in {16, 32, 64, 128}
     BLOCK_KV = block_size

-    if q.dim() == 4:
-        assert q.size(1) == 1, f"q_len is supposed to be 1 but is {q.size(1)}"
-        q = q.squeeze(1)
-
     grid = (triton.next_power_of_2(bsz), num_heads, triton.cdiv(triton.next_power_of_2(max_seq_len_in_batch), BLOCK_KV))
     _flash_decoding_fwd_kernel[grid](
         q,
@@ -250,6 +248,7 @@ def flash_decoding_attention(
         q.stride(0),
         q.stride(1),
         q.stride(2),
+        q.stride(3),
         k_cache.stride(0),
         k_cache.stride(1),
         k_cache.stride(2),
@@ -270,8 +269,8 @@ def flash_decoding_attention(
         HEAD_DIM=head_dim,
     )

-    output = torch.empty_like(q)
-    output = output.view(-1, output.size(-2), output.size(-1))
+    output = torch.empty_like(q)  # already overlapped
+    output = torch.empty((bsz, 1, num_heads, head_dim), dtype=q.dtype, device=q.device)

     grid = (bsz, num_heads)
     _flash_decoding_fwd_reduce_kernel[grid](
@@ -289,6 +288,7 @@ def flash_decoding_attention(
         output.stride(0),
         output.stride(1),
         output.stride(2),
+        output.stride(3),
         BLOCK_KV=block_size,
         HEAD_DIM=head_dim,
     )
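For reference, here is a minimal sketch (not part of the commit) of how the four query strides forwarded to _flash_decoding_fwd_kernel line up with the revised [bsz, num_heads, q_len(1), head_dim] layout. The tensor construction mirrors the updated test; the concrete sizes (bsz=4, num_heads=16, head_dim=128) are illustrative assumptions.

    import torch

    bsz, num_heads, q_len, head_dim = 4, 16, 1, 128
    # Allocate as [bsz, q_len, num_heads, head_dim] and transpose, as the revised test does,
    # to obtain the kernel's expected [bsz, num_heads, q_len, head_dim] layout.
    q = torch.empty(bsz, q_len, num_heads, head_dim, dtype=torch.float16).transpose(1, 2)

    # The call site now forwards these four values as (stride_qt, stride_qh, stride_ql, stride_qd).
    print(q.shape)    # torch.Size([4, 16, 1, 128])
    print(q.stride())  # (2048, 128, 2048, 1)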

tests/test_infer_ops/triton/kernel_utils.py

Lines changed: 3 additions & 9 deletions
@@ -21,9 +21,9 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 # src/transformers/models/llama/modeling_llama.py
 # https://github.com/huggingface/transformers/blob/633215ba58fe5114d8c8d32e415a04600e010701/src/transformers/models/llama/modeling_llama.py#L350
 def torch_attn_ref(
-    q: torch.Tensor,  # [bsz, seq_len, num_heads, head_dim]
-    k: torch.Tensor,  # [bsz, kv_seq_len, num_heads, head_dim]
-    v: torch.Tensor,  # [bsz, kv_seq_len, num_heads, head_dim]
+    q: torch.Tensor,  # [bsz, num_heads, q_len, head_dim]
+    k: torch.Tensor,  # [bsz, num_heads, kv_seq_len, head_dim]
+    v: torch.Tensor,  # [bsz, num_heads, kv_seq_len, head_dim]
     attention_mask: torch.Tensor,  # [bsz, 1, seq_len, kv_seq_len]
     bsz: int,
     seq_len: int,
@@ -33,12 +33,6 @@ def torch_attn_ref(
     head_dim: int,
 ):
     assert q.shape[-1] == k.shape[-1] == v.shape[-1] == head_dim
-    q = q.view(bsz, seq_len, num_heads, head_dim)
-    k = k.view(bsz, kv_seq_len, num_kv_heads, head_dim)
-    v = v.view(bsz, kv_seq_len, num_kv_heads, head_dim)
-    q = q.transpose(1, 2)
-    k = k.transpose(1, 2)
-    v = v.transpose(1, 2)

     # repeat kv for GQA and MQA
     # k/v won't change if kv_group_num is 1

tests/test_infer_ops/triton/test_context_attn_unpad.py

Lines changed: 3 additions & 3 deletions
@@ -34,9 +34,9 @@ def torch_attn_unpad(
         mask[mask == 0.0] = float("-inf")

         torch_attn_ref_out = torch_attn_ref(
-            q[start_idx:end_idx].unsqueeze(0),
-            k[start_idx:end_idx].unsqueeze(0),
-            v[start_idx:end_idx].unsqueeze(0),
+            q[start_idx:end_idx].unsqueeze(0).transpose(1, 2),
+            k[start_idx:end_idx].unsqueeze(0).transpose(1, 2),
+            v[start_idx:end_idx].unsqueeze(0).transpose(1, 2),
             mask,
             1,  # set bsz as 1 as we're processing sequence one by one
             seq_len,
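As a quick illustration of the caller-side change implied by the kernel_utils.py and test_context_attn_unpad.py diffs above (a sketch with assumed toy sizes, not code from the commit): torch_attn_ref no longer views or transposes its inputs, so callers hand it tensors already laid out as [bsz, num_heads, seq_len, head_dim].

    import torch

    bsz, seq_len, num_heads, head_dim = 2, 8, 4, 16
    q = torch.randn(bsz, seq_len, num_heads, head_dim)

    # torch_attn_ref used to view/transpose internally; after this commit the caller
    # transposes to [bsz, num_heads, seq_len, head_dim] before passing tensors in.
    q_for_ref = q.transpose(1, 2)
    print(q_for_ref.shape)  # torch.Size([2, 4, 8, 16])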

tests/test_infer_ops/triton/test_decoding_attn.py

Lines changed: 54 additions & 56 deletions
@@ -16,6 +16,9 @@

 TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")

+Q_LEN = 1
+HEAD_DIM = 128
+

 def prepare_padding_mask(kv_lengths: torch.Tensor, bsz: int, max_seq_len: int, device="cuda"):
     padding_mask = torch.zeros((bsz, 1, 1, max_seq_len), dtype=torch.float32, device=device)
@@ -48,74 +51,72 @@ def test_flash_decoding(

     num_kv_heads = num_attn_heads // kv_group_num
     assert isinstance(num_kv_heads, int) and num_kv_heads > 0, "Invalid number of kv heads."
-    q_len = 1
-    head_dim = 128
     max_seq_len = block_size * max_num_blocks_per_seq
     dtype = torch.float16
     device = get_current_device()

+    # Use the provided maximum sequence length for each sequence when testing with the same context length,
+    # otherwise generate random context lengths.
     context_lengths = (
         torch.tensor([max_seq_len for _ in range(bsz)], dtype=torch.int32, device=device)
         if same_context_len
         else torch.randint(low=1, high=max_seq_len, size=(bsz,), dtype=torch.int32, device=device)
     )
     num_tokens = torch.sum(context_lengths).item()

-    q_size = (bsz, q_len, num_attn_heads, head_dim)
-    q = torch.empty(size=q_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
-    q = q.view(bsz, q_len, num_attn_heads, head_dim)
+    q_size = (bsz, Q_LEN, num_attn_heads, HEAD_DIM)
+    q = torch.empty(size=q_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5).transpose(1, 2)
+    kv_unpad_size = (num_tokens, 2 * num_kv_heads, HEAD_DIM)
+    kv_unpad = torch.empty(size=kv_unpad_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
+    k_unpad, v_unpad = torch.split(kv_unpad, [num_kv_heads, num_kv_heads], dim=-2)

-    kv_size = (num_tokens, 2 * num_kv_heads, head_dim)
-    kv = torch.empty(size=kv_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
-    k, v = torch.split(kv, [num_kv_heads, num_kv_heads], dim=-2)
-
-    cache_shape = (bsz * max_num_blocks_per_seq, num_kv_heads, head_dim, block_size)
+    cache_shape = (bsz * max_num_blocks_per_seq, num_kv_heads, HEAD_DIM, block_size)
     k_cache = torch.zeros(size=cache_shape, dtype=dtype, device=device)
     v_cache = torch.zeros(size=cache_shape, dtype=dtype, device=device)
     # Mock allocation on block tables as well as blocked kv caches
     block_tables = mock_alloc_block_table_and_kvcache(
-        k, v, k_cache, v_cache, context_lengths, bsz, max_num_blocks_per_seq, block_size
+        k_unpad, v_unpad, k_cache, v_cache, context_lengths, bsz, max_num_blocks_per_seq, block_size
     )
     block_tables = block_tables.to(device=device)
-
-    max_seq_len = context_lengths.max().item()
-    # the maximum block length splitted on kv should be the kv cache block size
-    kv_max_split_num = (max_seq_len + block_size - 1) // block_size
+    # The maximum sequence length in the batch (if context lengths randomly generated)
+    max_seq_len_in_b = context_lengths.max().item()
+    # The maximum block length splitted on kv should be the kv cache block size
+    kv_max_split_num = (max_seq_len_in_b + block_size - 1) // block_size
     mid_output = torch.empty(
-        size=(bsz, num_attn_heads, kv_max_split_num, head_dim), dtype=torch.float32, device=q.device
+        size=(bsz, num_attn_heads, kv_max_split_num, HEAD_DIM), dtype=torch.float32, device=q.device
     )
     mid_output_lse = torch.empty(size=(bsz, num_attn_heads, kv_max_split_num), dtype=torch.float32, device=q.device)
-    sm_scale = 1.0 / (head_dim**0.5)
+    sm_scale = 1.0 / (HEAD_DIM**0.5)
     out_triton = flash_decoding_attention(
         q,
         k_cache,
         v_cache,
         context_lengths,
         block_tables,
-        max_seq_len,
+        max_seq_len_in_b,
         mid_output,
         mid_output_lse,
         block_size=block_size,
         sm_scale=sm_scale,
         kv_group_num=kv_group_num,
-    )
-    out_triton = out_triton.unsqueeze(1)  # [bsz, 1, num_heads, head_dim]
+    )  # [bsz, 1, num_heads, head_dim]

     # rebuild (batched) kv with padding for torch attention
-    # q   [bsz, 1, num_heads, head_dim]
-    # k/v [num_tokens, num_kv_heads, head_dim]
-    k_torch = torch.zeros((bsz, max_seq_len, num_kv_heads, head_dim), dtype=k.dtype, device=k.device)
+    # q   [bsz, num_heads, q_len, head_dim]
+    # k/v [bsz, max_seq_len_in_b, num_kv_heads, head_dim]
+    k_torch = torch.zeros((bsz, max_seq_len_in_b, num_kv_heads, HEAD_DIM), dtype=k_unpad.dtype, device=k_unpad.device)
     v_torch = torch.zeros_like(k_torch)
     prev_len_sum = 0
     for i, seq_len in enumerate(context_lengths.tolist()):
-        # mock left-side padding
-        k_torch[i, -seq_len:, :, :] = k[prev_len_sum : prev_len_sum + seq_len]
-        v_torch[i, -seq_len:, :, :] = v[prev_len_sum : prev_len_sum + seq_len]
+        # left-side padding
+        k_torch[i, -seq_len:, :, :] = k_unpad[prev_len_sum : prev_len_sum + seq_len]
+        v_torch[i, -seq_len:, :, :] = v_unpad[prev_len_sum : prev_len_sum + seq_len]
         prev_len_sum += seq_len
-    # k/v [bsz, max_seq_len, num_kv_heads, head_dim]
-    torch_padding_mask = prepare_padding_mask(context_lengths, bsz, k_torch.size(1), q.device)
+    torch_padding_mask = prepare_padding_mask(context_lengths, bsz, max_seq_len_in_b, q.device)
+    k_torch = k_torch.transpose(1, 2)
+    v_torch = v_torch.transpose(1, 2)
     out_torch = torch_attn_ref(
-        q, k_torch, v_torch, torch_padding_mask, bsz, 1, k_torch.size(1), num_attn_heads, num_kv_heads, head_dim
+        q, k_torch, v_torch, torch_padding_mask, bsz, 1, max_seq_len_in_b, num_attn_heads, num_kv_heads, HEAD_DIM
     )

     assert out_torch.shape == out_triton.shape
@@ -128,7 +129,7 @@ def test_flash_decoding(
 configs = [
     triton.testing.Benchmark(
         x_names=["KV_LEN"],
-        x_vals=[2**i for i in range(8, 14)],
+        x_vals=[2**i for i in range(8, 12)],
         # x_vals=[x for x in range(256, 8192, 256)],
         line_arg="provider",
         line_vals=["torch", "triton"],
@@ -154,30 +155,28 @@ def bench_kernel(
     rep = 100

     num_attn_heads = 16
-    max_num_blocks_per_seq = max(32, triton.cdiv(KV_LEN, block_size))
+    max_num_blocks_per_seq = triton.cdiv(KV_LEN, block_size)

     num_kv_heads = num_attn_heads // kv_group_num
     assert isinstance(num_kv_heads, int) and num_kv_heads > 0, "Invalid number of kv heads."
-    q_len = 1
-    head_dim = 128
-    max_seq_len = block_size * max_num_blocks_per_seq
+    block_size * max_num_blocks_per_seq
     dtype = torch.float16
     device = get_current_device()

     kv_lengths = (
-        torch.tensor([max_seq_len for _ in range(bsz)], dtype=torch.int32, device=device)
+        torch.tensor([KV_LEN for _ in range(bsz)], dtype=torch.int32, device=device)
         if same_context_len
-        else torch.randint(low=1, high=max_seq_len, size=(bsz,), dtype=torch.int32, device=device)
+        else torch.randint(low=1, high=KV_LEN, size=(bsz,), dtype=torch.int32, device=device)
     )
     num_tokens = torch.sum(kv_lengths).item()

-    q_size = (bsz, q_len, num_attn_heads, head_dim)
-    q = torch.empty(size=q_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
-    kv_size = (num_tokens, 2 * num_kv_heads, head_dim)
+    q_size = (bsz, Q_LEN, num_attn_heads, HEAD_DIM)
+    q = torch.empty(size=q_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5).transpose(1, 2)
+    kv_size = (num_tokens, 2 * num_kv_heads, HEAD_DIM)
     kv = torch.empty(size=kv_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
     k, v = torch.split(kv, [num_kv_heads, num_kv_heads], dim=-2)

-    cache_shape = (bsz * max_num_blocks_per_seq, num_kv_heads, head_dim, block_size)
+    cache_shape = (bsz * max_num_blocks_per_seq, num_kv_heads, HEAD_DIM, block_size)
     k_cache = torch.zeros(size=cache_shape, dtype=dtype, device=device)
     v_cache = torch.zeros(size=cache_shape, dtype=dtype, device=device)
     # Mock allocation on block tables as well as blocked kv caches
@@ -186,55 +185,54 @@ def bench_kernel(
     )
     block_tables = block_tables.to(device=device)

-    q = q.view(bsz, q_len, num_attn_heads, head_dim)
-    max_seq_len = kv_lengths.max().item()  # for random lengths
+    max_seq_len_in_b = kv_lengths.max().item()  # for random lengths

     quantiles = [0.5, 0.2, 0.8]
     if provider == "torch":
         # rebuild (batched) kv with padding for torch attention
-        # q   [bsz, 1, num_heads, head_dim]
-        # k/v [num_tokens, num_kv_heads, head_dim]
-        k_torch = torch.zeros((bsz, max_seq_len, num_kv_heads, head_dim), dtype=k.dtype, device=k.device)
+        # q   [bsz, num_heads, q_len, head_dim]
+        # k/v [bsz, max_seq_len_in_b, num_kv_heads, head_dim]
+        k_torch = torch.zeros((bsz, max_seq_len_in_b, num_kv_heads, HEAD_DIM), dtype=k.dtype, device=k.device)
         v_torch = torch.zeros_like(k_torch)
         prev_len_sum = 0
         for i, seq_len in enumerate(kv_lengths.tolist()):
             # mock left-side padding
             k_torch[i, -seq_len:, :, :] = k[prev_len_sum : prev_len_sum + seq_len]
             v_torch[i, -seq_len:, :, :] = v[prev_len_sum : prev_len_sum + seq_len]
             prev_len_sum += seq_len
-        # k/v [bsz, max_seq_len, num_kv_heads, head_dim]
-        torch_padding_mask = prepare_padding_mask(kv_lengths, bsz, k_torch.size(1), q.device)
+        torch_padding_mask = prepare_padding_mask(kv_lengths, bsz, max_seq_len_in_b, q.device)
+        k_torch = k_torch.transpose(1, 2)
+        v_torch = v_torch.transpose(1, 2)
         fn = lambda: torch_attn_ref(
-            q, k_torch, v_torch, torch_padding_mask, bsz, 1, k_torch.size(1), num_attn_heads, num_kv_heads, head_dim
+            q, k_torch, v_torch, torch_padding_mask, bsz, 1, max_seq_len_in_b, num_attn_heads, num_kv_heads, HEAD_DIM
         )
         ms, min_ms, max_ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep, quantiles=quantiles)
     if provider == "triton":
         # the maximum block length splitted on kv should be the kv cache block size
-        kv_max_split_num = (max_seq_len + block_size - 1) // block_size
+        kv_max_split_num = (max_seq_len_in_b + block_size - 1) // block_size
         mid_output = torch.empty(
-            size=(bsz, num_attn_heads, kv_max_split_num, head_dim), dtype=torch.float32, device=q.device
+            size=(bsz, num_attn_heads, kv_max_split_num, HEAD_DIM), dtype=torch.float32, device=q.device
         )
         mid_output_lse = torch.empty(size=(bsz, num_attn_heads, kv_max_split_num), dtype=torch.float32, device=q.device)
-        sm_scale = 1.0 / (head_dim**0.5)
+        sm_scale = 1.0 / (HEAD_DIM**0.5)
         fn = lambda: flash_decoding_attention(
             q,
             k_cache,
             v_cache,
             kv_lengths,
             block_tables,
-            max_seq_len,
+            max_seq_len_in_b,
             mid_output,
             mid_output_lse,
             block_size=block_size,
             sm_scale=sm_scale,
             kv_group_num=kv_group_num,
-        ).unsqueeze(1)
-
+        )  # [bsz, 1, num_heads, head_dim]
         ms, min_ms, max_ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep, quantiles=quantiles)

     return ms, min_ms, max_ms


 if __name__ == "__main__":
-    test_flash_decoding(16, 32, 32, 16, 1, True)
-    # bench_kernel.run(save_path=".", print_data=True)
+    # test_flash_decoding(16, 32, 32, 16, 1, True)
+    bench_kernel.run(save_path=".", print_data=True)
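To make the sizing of the split-KV intermediates in the updated test and benchmark concrete, here is a small worked sketch (the numbers are illustrative assumptions, not values from the commit).

    # Each (batch, head) pair produces at most one partial result per KV-cache block,
    # so the intermediate buffers are sized by the ceiling division below.
    bsz, num_heads, head_dim, block_size = 4, 16, 128, 32
    max_seq_len_in_b = 1000  # longest context length in the batch

    kv_max_split_num = (max_seq_len_in_b + block_size - 1) // block_size  # ceil(1000 / 32) = 32
    mid_output_shape = (bsz, num_heads, kv_max_split_num, head_dim)       # partial attention outputs
    mid_output_lse_shape = (bsz, num_heads, kv_max_split_num)             # partial log-sum-exp values
    print(kv_max_split_num, mid_output_shape, mid_output_lse_shape)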
