We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 6bb1809 · commit 31b905e — Copy full SHA for 31b905e
colossalai/inference/modeling/layers/attention.py
@@ -207,7 +207,7 @@ def pad_context_forward(
207
num_kv_heads = k.shape[-2]
208
assert num_heads % num_kv_heads == 0, "num_heads should be divisible by num_kv_heads"
209
num_kv_groups = num_heads // num_kv_heads
210
- block_size = k_cache.shape[-1]
+ block_size = k_cache.size(-2)
211
assert q.shape[0] == k.shape[0] == v.shape[0] == block_tables.shape[0]
212
block_tables.shape[-1] * block_size
213
0 commit comments