
Commit 0a6ea7d

fix calling of triton kernel in modeling

1 parent 0ec90af, commit 0a6ea7d

3 files changed: +31 additions, -8 deletions

colossalai/inference/modeling/models/llama.py (10 additions & 2 deletions)

@@ -6,7 +6,7 @@
 
 from colossalai.inference.modeling.layers.attention import PagedAttention
 from colossalai.inference.struct import BatchInfo
-from colossalai.kernel.triton import context_attention_unpadded, copy_kv_to_blocked_cache, flash_decoding_fwd
+from colossalai.kernel.triton import context_attention_unpadded, copy_kv_to_blocked_cache, flash_decoding_attention
 from colossalai.logging import get_dist_logger
 
 from flash_attn.bert_padding import index_first_axis, pad_input  # noqa
@@ -209,7 +209,15 @@ def llama_attn_forward(
         if HAS_TRITON:
             copy_kv_to_blocked_cache(key_states, k_cache, kv_lengths=sequence_lengths, block_tables=block_tables)
             copy_kv_to_blocked_cache(value_states, v_cache, kv_lengths=sequence_lengths, block_tables=block_tables)
-            attn_output = flash_decoding_fwd(query_states, k_cache, v_cache, sequence_lengths, block_tables, block_size)
+            # TODO: dummy transpose and squeeze are added on the in/out tensors of the triton flash decoding
+            # kernel to maintain compatibility. This part, as well as the handling of query_states and
+            # attn_output, should be revised: earlier in `llama_attn_forward` there is still a redundant
+            # transpose, and the in/out layouts of the torch- and triton-version decoding kernels are not consistent.
+            query_states = query_states.transpose(1, 2)
+            attn_output = flash_decoding_attention(
+                query_states, k_cache, v_cache, sequence_lengths, block_tables, block_size
+            )
+            attn_output = attn_output.squeeze(1)
         else:
             attn_output = PagedAttention.pad_decoding_forward(
                 query_states, key_states, value_states, k_cache, v_cache, sequence_lengths, block_tables, attention_mask
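
To make the shape bookkeeping behind the dummy transpose/squeeze easier to follow, here is a minimal PyTorch sketch. The concrete sizes, and the assumption that query_states arrives as [bsz, num_heads, q_len, head_dim] with q_len == 1 during decoding, are illustrative only; they are inferred from the transpose(1, 2)/squeeze(1) calls above and from the [bsz, 1, num_heads, head_dim] annotation in the test file, not stated explicitly in this commit.

import torch

bsz, num_heads, head_dim = 4, 32, 128                     # illustrative sizes (assumed)
query_states = torch.randn(bsz, num_heads, 1, head_dim)   # torch-style layout, q_len == 1 at decoding
query_states = query_states.transpose(1, 2)               # -> [bsz, 1, num_heads, head_dim] for the triton kernel
attn_output = torch.randn(bsz, 1, num_heads, head_dim)    # stand-in for the triton kernel output
attn_output = attn_output.squeeze(1)                      # -> [bsz, num_heads, head_dim]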

colossalai/kernel/triton/flash_decoding.py (19 additions & 4 deletions)

@@ -188,11 +188,11 @@ def flash_decoding_attention(
     v_cache: torch.Tensor,
     kv_seq_len: torch.Tensor,
     block_tables: torch.Tensor,
-    max_seq_len_in_batch: int,
-    mid_output: torch.Tensor,
-    mid_output_lse: torch.Tensor,
     block_size: int,
-    sm_scale: int,
+    max_seq_len_in_batch: int = None,
+    mid_output: torch.Tensor = None,
+    mid_output_lse: torch.Tensor = None,
+    sm_scale: int = None,
     kv_group_num: int = 1,
 ):
     """
@@ -236,6 +236,21 @@ def flash_decoding_attention(
     assert block_size in {16, 32, 64, 128}
     BLOCK_KV = block_size
 
+    sm_scale = 1.0 / (head_dim**0.5) if sm_scale is None else sm_scale
+    max_seq_len_in_batch = kv_seq_len.max().item() if max_seq_len_in_batch is None else max_seq_len_in_batch
+    # For compatibility (TODO revise modeling in future)
+    kv_max_split_num = (max_seq_len_in_batch + BLOCK_KV - 1) // BLOCK_KV
+    mid_output = (
+        torch.zeros(size=(bsz, num_heads, kv_max_split_num, head_dim), dtype=torch.float32, device=q.device)
+        if mid_output is None
+        else mid_output
+    )
+    mid_output_lse = (
+        torch.zeros(size=(bsz, num_heads, kv_max_split_num), dtype=torch.float32, device=q.device)
+        if mid_output_lse is None
+        else mid_output_lse
+    )
+
     grid = (triton.next_power_of_2(bsz), num_heads, triton.cdiv(triton.next_power_of_2(max_seq_len_in_batch), BLOCK_KV))
     _flash_decoding_fwd_kernel[grid](
         q,
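
As a usage sketch of the relaxed signature: a caller may now pass only the required arguments and let the wrapper derive the softmax scale, the batch-wide maximum sequence length, and the two scratch buffers, which is equivalent to precomputing them by hand. The names bsz, num_heads, and head_dim below are placeholders inferred from the buffer shapes above, not part of the public API.

# Call relying on the new defaults
out = flash_decoding_attention(q, k_cache, v_cache, kv_seq_len, block_tables, block_size)

# Equivalent explicit form, mirroring the default computation added above
sm_scale = 1.0 / (head_dim**0.5)
max_seq_len_in_batch = kv_seq_len.max().item()
kv_max_split_num = (max_seq_len_in_batch + block_size - 1) // block_size
mid_output = torch.zeros(bsz, num_heads, kv_max_split_num, head_dim, dtype=torch.float32, device=q.device)
mid_output_lse = torch.zeros(bsz, num_heads, kv_max_split_num, dtype=torch.float32, device=q.device)
out = flash_decoding_attention(
    q, k_cache, v_cache, kv_seq_len, block_tables, block_size,
    max_seq_len_in_batch, mid_output, mid_output_lse, sm_scale,
)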

tests/test_infer_ops/triton/test_decoding_attn.py (2 additions & 2 deletions)

@@ -93,10 +93,10 @@ def test_flash_decoding(
         v_cache,
         context_lengths,
         block_tables,
+        block_size,
         max_seq_len_in_b,
         mid_output,
         mid_output_lse,
-        block_size=block_size,
         sm_scale=sm_scale,
         kv_group_num=kv_group_num,
     )  # [bsz, 1, num_heads, head_dim]
@@ -221,10 +221,10 @@ def bench_kernel(
         v_cache,
         kv_lengths,
         block_tables,
+        block_size,
         max_seq_len_in_b,
         mid_output,
         mid_output_lse,
-        block_size=block_size,
         sm_scale=sm_scale,
         kv_group_num=kv_group_num,
     )  # [bsz, 1, num_heads, head_dim]
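
In both call sites the only change is that block_size moves from a trailing keyword argument to its new positional slot right after block_tables, matching the reordered signature; sm_scale and kv_group_num stay as keyword arguments. A condensed before/after sketch follows; the leading query and cache arguments are not shown in the hunks, so q, k_cache, and v_cache here are assumed names.

# Before: block_size passed as a keyword after the scratch buffers
flash_decoding_attention(q, k_cache, v_cache, kv_lengths, block_tables,
                         max_seq_len_in_b, mid_output, mid_output_lse,
                         block_size=block_size, sm_scale=sm_scale, kv_group_num=kv_group_num)

# After: block_size is positional, directly after block_tables
flash_decoding_attention(q, k_cache, v_cache, kv_lengths, block_tables, block_size,
                         max_seq_len_in_b, mid_output, mid_output_lse,
                         sm_scale=sm_scale, kv_group_num=kv_group_num)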
