@@ -395,12 +395,12 @@ def _schedule_running(
395395 # We can have up to 1 running prefill at any given time in running
396396 # queue, which means we can guarantee chunk size is at least 1.
397397 assert num_running_tokens != 0
398- num_running_seqs = seq_group .get_max_num_running_seqs ()
399398
400399 running_queue .popleft ()
401400 while not self ._can_append_slots (seq_group ):
402401 budget .subtract_num_batched_tokens (seq_group .request_id ,
403402 num_running_tokens )
403+ num_running_seqs = seq_group .get_max_num_running_seqs ()
404404 budget .subtract_num_seqs (seq_group .request_id ,
405405 num_running_seqs )
406406 if curr_loras is not None and seq_group .lora_int_id > 0 :
@@ -439,7 +439,13 @@ def _schedule_running(
439439 token_chunk_size = 1 ))
440440 budget .add_num_batched_tokens (seq_group .request_id ,
441441 num_running_tokens )
442- budget .add_num_seqs (seq_group .request_id , num_running_seqs )
442+ # OPTIMIZATION: Note that get_max_num_running_seqs is
443+ # expensive. For the default scheduling case where
444+ # enable_chunking is False, num_seqs are updated before running
445+ # this method, so we don't have to update it again here.
446+ if enable_chunking :
447+ num_running_seqs = seq_group .get_max_num_running_seqs ()
448+ budget .add_num_seqs (seq_group .request_id , num_running_seqs )
443449 if curr_loras is not None and seq_group .lora_int_id > 0 :
444450 curr_loras .add (seq_group .lora_int_id )
445451
0 commit comments