Skip to content

Commit 9941b92

Browse files
Merge pull request vllm-project#8 from robertgshaw2-redhat/avoid-unnatural-free
updated
2 parents ded125b + e49a686 commit 9941b92

File tree

2 files changed

+27
-16
lines changed

2 files changed

+27
-16
lines changed

vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def get_computed_blocks(
107107
- A list of blocks that are computed for the request.
108108
- The number of computed tokens.
109109
"""
110+
110111
if not self.enable_caching:
111112
# Prefix caching is disabled.
112113
return [], 0
@@ -274,8 +275,6 @@ def allocate_slots(
274275
# For disaggregated serving, avoid caching until the KVs are received.
275276
if skip_cache_blocks:
276277
assert request.request_id not in self.num_cached_block
277-
self.num_cached_block[request.request_id] = len(
278-
new_computed_blocks)
279278
return new_blocks
280279

281280
self.cache_blocks(

vllm/v1/core/sched/scheduler_disagg.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -196,18 +196,22 @@ def schedule(self) -> SchedulerOutput:
196196
# for the requests to arrive.
197197
if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
198198
if request.request_id in self.finished_recving_kv_req_ids:
199-
assert self.kv_cache_manager.enable_caching
200199
# Now that the KVs have been received, we can cache
201200
# them and set num_computed_tokens.
201+
blocks = self.kv_cache_manager.req_to_blocks[
202+
request.request_id]
203+
num_computed_tokens = len(blocks) * self.block_size
202204
self.kv_cache_manager.cache_blocks(
203205
request,
204206
num_tokens=0,
205-
num_computed_tokens=(len(request.all_token_ids) -
206-
1))
207+
num_computed_tokens=num_computed_tokens)
208+
assert blocks[-1].block_hash is not None
209+
210+
request.num_computed_tokens = num_computed_tokens
211+
request.status = RequestStatus.WAITING
212+
207213
self.finished_recving_kv_req_ids.remove(
208214
request.request_id)
209-
request.status = RequestStatus.WAITING
210-
self.kv_cache_manager.free(request)
211215
else:
212216
self.waiting.popleft()
213217
skipped_waiting_requests.appendleft(request)
@@ -224,10 +228,16 @@ def schedule(self) -> SchedulerOutput:
224228
skipped_waiting_requests.appendleft(request)
225229
continue
226230

227-
# Get already-cached tokens.
228-
computed_blocks, num_computed_tokens = \
229-
self.kv_cache_manager.get_computed_blocks(
230-
request)
231+
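# TODO: document this branch. If the request already has blocks in
# req_to_blocks, derive num_computed_tokens from them instead of
# querying the prefix cache.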
232+
request_blocks = self.kv_cache_manager.req_to_blocks.get(
233+
request.request_id, None)
234+
if request_blocks:
235+
new_computed_blocks = []
236+
num_computed_tokens = len(request_blocks) * self.block_size
237+
else:
238+
# Get already-cached tokens.
239+
new_computed_blocks, num_computed_tokens = (
240+
self.kv_cache_manager.get_computed_blocks(request))
231241

232242
# Get externally-cached tokens if using a KVConnector.
233243
num_external_tokens = (
@@ -244,7 +254,7 @@ def schedule(self) -> SchedulerOutput:
244254
new_blocks = self.kv_cache_manager.allocate_slots(
245255
request,
246256
num_external_tokens,
247-
computed_blocks,
257+
new_computed_blocks,
248258
skip_cache_blocks=True)
249259
if new_blocks is None:
250260
# Requests cannot be scheduled
@@ -262,7 +272,7 @@ def schedule(self) -> SchedulerOutput:
262272
request,
263273
[
264274
b.block_id for b in itertools.chain(
265-
computed_blocks, new_blocks)
275+
new_computed_blocks, new_blocks)
266276
],
267277
num_external_tokens,
268278
)
@@ -274,6 +284,8 @@ def schedule(self) -> SchedulerOutput:
274284
# We use `request.num_tokens` instead of
275285
# `request.num_prompt_tokens` to consider the resumed request,
276286
# which has output tokens.
287+
print(f"{request.num_tokens=}")
288+
print(f"{num_computed_tokens=}")
277289
num_new_tokens = request.num_tokens - num_computed_tokens
278290
if (0 < self.scheduler_config.long_prefill_token_threshold <
279291
num_new_tokens):
@@ -298,7 +310,7 @@ def schedule(self) -> SchedulerOutput:
298310
new_blocks = self.kv_cache_manager.allocate_slots(
299311
request,
300312
num_new_tokens + num_external_tokens,
301-
computed_blocks,
313+
new_computed_blocks,
302314
)
303315
if new_blocks is None:
304316
# The request cannot be scheduled.
@@ -312,7 +324,7 @@ def schedule(self) -> SchedulerOutput:
312324
request,
313325
[
314326
b.block_id for b in itertools.chain(
315-
computed_blocks, new_blocks)
327+
new_computed_blocks, new_blocks)
316328
],
317329
num_external_tokens,
318330
)
@@ -335,7 +347,7 @@ def schedule(self) -> SchedulerOutput:
335347
f"Invalid request status: {request.status}")
336348

337349
req_to_new_block_ids[request.request_id] = [
338-
b.block_id for b in computed_blocks + new_blocks
350+
b.block_id for b in new_computed_blocks + new_blocks
339351
]
340352
num_scheduled_tokens[request.request_id] = num_new_tokens
341353
token_budget -= num_new_tokens

0 commit comments

Comments
 (0)