@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import itertools
 import time
 from collections import deque
 from collections.abc import Iterable
@@ -144,7 +145,7 @@ def schedule(self) -> SchedulerOutput:
         # uses structured decoding.
         structured_output_request_ids: dict[str, int] = {}
 
-        req_to_new_block_ids: dict[str, list[int]] = {}
+        req_to_new_block_ids: dict[str, list[list[int]]] = {}
         num_scheduled_tokens: dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
         # Encoder-related.
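The value type of req_to_new_block_ids changes from a flat list of block IDs to one list per KV cache group. A minimal sketch of the new shape, assuming two groups (group count and IDs below are illustrative, not from the diff):

# Hypothetical example of the nested per-group layout:
req_to_new_block_ids: dict[str, list[list[int]]] = {
    "req-0": [
        [3, 7, 9],   # block IDs newly allocated in KV cache group 0
        [12, 15],    # block IDs newly allocated in KV cache group 1
    ],
}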
@@ -165,7 +166,8 @@ def schedule(self) -> SchedulerOutput:
                 req_index += 1
                 continue
 
-            num_new_tokens = (request.num_tokens_with_spec -
+            num_draft_tokens = len(request.draft_token_ids)
+            num_new_tokens = (request.num_tokens + num_draft_tokens -
                               request.num_computed_tokens)
             if (0 < self.scheduler_config.long_prefill_token_threshold <
                     num_new_tokens):
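num_new_tokens is now derived from the committed token count plus the draft (speculative) tokens instead of num_tokens_with_spec. A worked example under assumed numbers (all values illustrative):

num_tokens = 100             # committed prompt + output tokens (assumed)
draft_token_ids = [7, 8, 9]  # three speculative draft tokens (assumed)
num_computed_tokens = 96     # tokens whose KV entries already exist (assumed)

num_draft_tokens = len(draft_token_ids)                                # 3
num_new_tokens = num_tokens + num_draft_tokens - num_computed_tokens   # 7
assert num_new_tokens == 7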
@@ -196,7 +198,8 @@ def schedule(self) -> SchedulerOutput:
             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
-                    num_new_tokens,
+                    num_new_tokens - num_draft_tokens,
+                    num_draft_tokens=num_draft_tokens,
                     num_lookahead_tokens=self.num_lookahead_tokens)
                 if new_blocks is None:
                     # The request cannot be scheduled.
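At the allocation call the committed and draft portions are now passed separately: the positional argument shrinks to num_new_tokens - num_draft_tokens and the draft count travels in its own keyword argument. A hedged sketch of how an allocator could use that split; this is not the real allocate_slots implementation, only an illustration of the separation, with an assumed block size:

def allocate_slots_sketch(num_new_tokens: int, num_draft_tokens: int,
                          num_lookahead_tokens: int, block_size: int = 16) -> int:
    """Illustrative only: slots are reserved for committed, draft, and
    lookahead tokens, but only committed tokens advance the cached prefix."""
    total = num_new_tokens + num_draft_tokens + num_lookahead_tokens
    # Number of blocks needed to cover all reserved slots (ceil division).
    return -(-total // block_size)

# With the assumed values from the example above: 4 committed + 3 draft
# tokens, no lookahead, 16-token blocks -> 1 block.
print(allocate_slots_sketch(4, 3, 0))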
@@ -233,7 +236,7 @@ def schedule(self) -> SchedulerOutput:
                 # cycle to fill in the bitmask, which could be a big no-op.
                 structured_output_request_ids[request.request_id] = req_index
             req_to_new_block_ids[request.request_id] = [
-                b.block_id for b in new_blocks
+                [b.block_id for b in blocks] for blocks in new_blocks
             ]
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
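allocate_slots now returns one list of blocks per KV cache group, so the comprehension nests accordingly. A small self-contained illustration of the new shape; FakeBlock stands in for KVCacheBlock since only block_id matters here:

from dataclasses import dataclass

@dataclass
class FakeBlock:          # stand-in for KVCacheBlock, illustration only
    block_id: int

new_blocks = [[FakeBlock(3), FakeBlock(7)], [FakeBlock(12)]]  # two groups

nested = [[b.block_id for b in blocks] for blocks in new_blocks]
assert nested == [[3, 7], [12]]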
@@ -330,7 +333,11 @@ def schedule(self) -> SchedulerOutput:
                 new_encoder_budget = encoder_budget
 
                 new_blocks = self.kv_cache_manager.allocate_slots(
-                    request, num_new_tokens, num_computed_tokens, computed_blocks)
+                    request,
+                    num_new_tokens,
+                    new_computed_tokens=num_computed_tokens,
+                    new_computed_blocks=computed_blocks,
+                    num_lookahead_tokens=self.num_lookahead_tokens)
                 if new_blocks is None:
                     # The request cannot be scheduled.
                     break
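For newly scheduled (waiting) requests the call switches to keyword arguments and also passes the lookahead budget. Taken together with the running-request call above, the two call sites suggest an allocator interface roughly like the stub below; this is an assumption inferred from the diff, not the actual method definition:

# Assumed shape of the allocator interface, inferred from the two call sites:
def allocate_slots(request, num_new_tokens,
                   new_computed_tokens=0, new_computed_blocks=None,
                   num_draft_tokens=0, num_lookahead_tokens=0):
    """Returns per-group lists of newly allocated blocks, or None if the
    request cannot be scheduled (illustrative stub only)."""
    ...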
@@ -355,9 +362,9 @@ def schedule(self) -> SchedulerOutput:
 
                 if self.lora_config and request.lora_request:
                     scheduled_loras.add(request.lora_request.lora_int_id)
-                req_to_new_block_ids[request.request_id] = [
-                    b.block_id for b in computed_blocks + new_blocks
-                ]
+                req_to_new_block_ids[request.request_id] = [[
+                    b.block_id for b in itertools.chain(b1, b2)
+                ] for b1, b2 in zip(computed_blocks, new_blocks)]
                 num_scheduled_tokens[request.request_id] = num_new_tokens
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
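Since computed_blocks and new_blocks are both per-group lists now, the flat concatenation is replaced by a group-wise zip plus itertools.chain. A small illustration with stubbed blocks (values made up):

import itertools
from dataclasses import dataclass

@dataclass
class FakeBlock:          # stand-in for KVCacheBlock, illustration only
    block_id: int

computed_blocks = [[FakeBlock(1)], [FakeBlock(4), FakeBlock(5)]]  # per group
new_blocks = [[FakeBlock(2), FakeBlock(3)], [FakeBlock(6)]]       # per group

merged = [[b.block_id for b in itertools.chain(b1, b2)]
          for b1, b2 in zip(computed_blocks, new_blocks)]
assert merged == [[1, 2, 3], [4, 5, 6]]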
@@ -463,7 +470,7 @@ def _make_cached_request_data(
         request: Request,
         num_scheduled_tokens: int,
         num_scheduled_spec_tokens: int,
-        new_block_ids: list[int],
+        new_block_ids: list[list[int]],
         resumed_from_preemption: bool,
     ) -> CachedRequestData:
         # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
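The cached-request path sees the same type change: new_block_ids becomes a list of per-group ID lists. A hedged sketch of how a consumer might fold such an update into previously sent block IDs; the helper below is hypothetical, not the actual CachedRequestData handling:

def merge_new_block_ids(existing: list[list[int]],
                        new_block_ids: list[list[int]]) -> list[list[int]]:
    """Hypothetical helper: extend each KV cache group's ID list in place."""
    for group_ids, new_ids in zip(existing, new_block_ids):
        group_ids.extend(new_ids)
    return existing

assert merge_new_block_ids([[1, 2], [10]],
                           [[3], [11, 12]]) == [[1, 2, 3], [10, 11, 12]]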