@@ -196,18 +196,22 @@ def schedule(self) -> SchedulerOutput:
196196 # for the requests to arrive.
197197 if request .status == RequestStatus .WAITING_FOR_REMOTE_KVS :
198198 if request .request_id in self .finished_recving_kv_req_ids :
199- assert self .kv_cache_manager .enable_caching
200199 # Now that the KVs have been received, we can cache
201200 # them and set num_computed_tokens.
201+ blocks = self .kv_cache_manager .req_to_blocks [
202+ request .request_id ]
203+ num_computed_tokens = len (blocks ) * self .block_size
202204 self .kv_cache_manager .cache_blocks (
203205 request ,
204206 num_tokens = 0 ,
205- num_computed_tokens = (len (request .all_token_ids ) -
206- 1 ))
207+ num_computed_tokens = num_computed_tokens )
208+ assert blocks [- 1 ].block_hash is not None
209+
210+ request .num_computed_tokens = num_computed_tokens
211+ request .status = RequestStatus .WAITING
212+
207213 self .finished_recving_kv_req_ids .remove (
208214 request .request_id )
209- request .status = RequestStatus .WAITING
210- self .kv_cache_manager .free (request )
211215 else :
212216 self .waiting .popleft ()
213217 skipped_waiting_requests .appendleft (request )
@@ -224,10 +228,16 @@ def schedule(self) -> SchedulerOutput:
224228 skipped_waiting_requests .appendleft (request )
225229 continue
226230
227- # Get already-cached tokens.
228- computed_blocks , num_computed_tokens = \
229- self .kv_cache_manager .get_computed_blocks (
230- request )
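231+ # TODO: document why. If the KV cache manager already holds blocks for this request, count them all as computed and skip the cache lookup.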
232+ request_blocks = self .kv_cache_manager .req_to_blocks .get (
233+ request .request_id , None )
234+ if request_blocks :
235+ new_computed_blocks = []
236+ num_computed_tokens = len (request_blocks ) * self .block_size
237+ else :
238+ # Get already-cached tokens.
239+ new_computed_blocks , num_computed_tokens = (
240+ self .kv_cache_manager .get_computed_blocks (request ))
231241
232242 # Get externally-cached tokens if using a KVConnector.
233243 num_external_tokens = (
@@ -244,7 +254,7 @@ def schedule(self) -> SchedulerOutput:
244254 new_blocks = self .kv_cache_manager .allocate_slots (
245255 request ,
246256 num_external_tokens ,
247- computed_blocks ,
257+ new_computed_blocks ,
248258 skip_cache_blocks = True )
249259 if new_blocks is None :
250260 # Requests cannot be scheduled
@@ -262,7 +272,7 @@ def schedule(self) -> SchedulerOutput:
262272 request ,
263273 [
264274 b .block_id for b in itertools .chain (
265- computed_blocks , new_blocks )
275+ new_computed_blocks , new_blocks )
266276 ],
267277 num_external_tokens ,
268278 )
@@ -274,6 +284,8 @@ def schedule(self) -> SchedulerOutput:
274284 # We use `request.num_tokens` instead of
275285 # `request.num_prompt_tokens` to account for resumed requests,
276286 # which have output tokens.
287+ print (f"{ request .num_tokens = } " )
288+ print (f"{ num_computed_tokens = } " )
277289 num_new_tokens = request .num_tokens - num_computed_tokens
278290 if (0 < self .scheduler_config .long_prefill_token_threshold <
279291 num_new_tokens ):
@@ -298,7 +310,7 @@ def schedule(self) -> SchedulerOutput:
298310 new_blocks = self .kv_cache_manager .allocate_slots (
299311 request ,
300312 num_new_tokens + num_external_tokens ,
301- computed_blocks ,
313+ new_computed_blocks ,
302314 )
303315 if new_blocks is None :
304316 # The request cannot be scheduled.
@@ -312,7 +324,7 @@ def schedule(self) -> SchedulerOutput:
312324 request ,
313325 [
314326 b .block_id for b in itertools .chain (
315- computed_blocks , new_blocks )
327+ new_computed_blocks , new_blocks )
316328 ],
317329 num_external_tokens ,
318330 )
@@ -335,7 +347,7 @@ def schedule(self) -> SchedulerOutput:
335347 f"Invalid request status: { request .status } " )
336348
337349 req_to_new_block_ids [request .request_id ] = [
338- b .block_id for b in computed_blocks + new_blocks
350+ b .block_id for b in new_computed_blocks + new_blocks
339351 ]
340352 num_scheduled_tokens [request .request_id ] = num_new_tokens
341353 token_budget -= num_new_tokens
0 commit comments