[Bugfix][V1] Re-compute an entire block when fully cache hit (vllm-project#11186)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
comaniac authored Dec 14, 2024
1 parent 4b5b8a6 commit 9855aea
Showing 1 changed file with 7 additions and 3 deletions.
vllm/v1/core/scheduler.py (10 changes: 7 additions & 3 deletions)
@@ -199,9 +199,13 @@ def schedule(self) -> "SchedulerOutput":
                 if num_new_tokens == 0:
                     # This happens when prompt length is divisible by the block
                     # size and all blocks are cached. Now we force to recompute
-                    # the last token.
-                    num_computed_tokens -= 1
-                    num_new_tokens = 1
+                    # the last block. Note that we have to re-compute an entire
+                    # block because allocate_slots() assumes num_computed_tokens
+                    # is always a multiple of the block size. This limitation
+                    # can potentially be removed in the future to slightly
+                    # improve the performance.
+                    num_computed_tokens -= self.block_size
+                    num_new_tokens = self.block_size
                     computed_blocks.pop()
                 num_new_tokens = min(num_new_tokens, token_budget)
                 assert num_new_tokens > 0
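
For readers unfamiliar with the scheduler internals, the sketch below illustrates the arithmetic behind the change. It is not vLLM code; handle_full_cache_hit, BLOCK_SIZE, and the token counts are hypothetical stand-ins used only to show that rolling back a whole block, rather than a single token, keeps num_computed_tokens a multiple of the block size, which is the assumption the diff attributes to allocate_slots().

# Standalone illustration (not part of the commit). BLOCK_SIZE and the
# token counts below are hypothetical.

BLOCK_SIZE = 16

def handle_full_cache_hit(num_prompt_tokens: int, num_computed_tokens: int):
    """Mimic the scheduler's fully-cached-prompt handling after this change."""
    num_new_tokens = num_prompt_tokens - num_computed_tokens
    if num_new_tokens == 0:
        # Before this commit: num_computed_tokens -= 1 would leave 47 computed
        # tokens for a 48-token prompt, which is not a multiple of BLOCK_SIZE.
        # After this commit: roll back an entire block so the count stays
        # block-aligned.
        num_computed_tokens -= BLOCK_SIZE
        num_new_tokens = BLOCK_SIZE
    return num_computed_tokens, num_new_tokens

# A 48-token prompt (three full 16-token blocks) that is entirely cached:
computed, new = handle_full_cache_hit(48, 48)
assert computed == 32 and computed % BLOCK_SIZE == 0
assert new == BLOCK_SIZE

The assertions pass because 48 - 16 = 32 remains a multiple of 16, whereas the pre-commit rollback of a single token would have produced 47.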
