We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 6bb1809 · commit 31b905e — Copy full SHA for 31b905e
colossalai/inference/modeling/layers/attention.py
@@ -207,7 +207,7 @@ def pad_context_forward(
207
num_kv_heads = k.shape[-2]
208
assert num_heads % num_kv_heads == 0, "num_heads should be divisible by num_kv_heads"
209
num_kv_groups = num_heads // num_kv_heads
210
- block_size = k_cache.shape[-1]
+ block_size = k_cache.size(-2)
211
assert q.shape[0] == k.shape[0] == v.shape[0] == block_tables.shape[0]
212
block_tables.shape[-1] * block_size
213
0 commit comments