@@ -182,7 +182,7 @@ def schedule(self) -> SchedulerOutput:
182182 token_budget = self.max_num_scheduled_tokens
183183 # Encoder-related.
184184 scheduled_encoder_inputs: dict[str, list[int]] = {}
185- encoder_budget = self.max_num_encoder_input_tokens
185+ encoder_compute_budget = self.max_num_encoder_input_tokens
186186 # Spec decode-related.
187187 scheduled_spec_decode_tokens: dict[str, list[int]] = {}
188188
@@ -211,12 +211,13 @@ def schedule(self) -> SchedulerOutput:
211211
212212 # Schedule encoder inputs.
213213 encoder_inputs_to_schedule = None
214- new_encoder_budget = encoder_budget
214+ new_encoder_compute_budget = encoder_compute_budget
215215 if request.has_encoder_inputs:
216216 (encoder_inputs_to_schedule, num_new_tokens,
217- new_encoder_budget) = self._try_schedule_encoder_inputs(
217+ new_encoder_compute_budget
218+ ) = self._try_schedule_encoder_inputs(
218219 request, request.num_computed_tokens, num_new_tokens,
219- encoder_budget)
220+ encoder_compute_budget)
220221
221222 if num_new_tokens == 0:
222223 # The request cannot be scheduled because one of the following
@@ -298,7 +299,7 @@ def schedule(self) -> SchedulerOutput:
298299 # Allocate the encoder cache.
299300 for i in encoder_inputs_to_schedule:
300301 self.encoder_cache_manager.allocate(request, i)
301- encoder_budget = new_encoder_budget
302+ encoder_compute_budget = new_encoder_compute_budget
302303
303304 # Record the LoRAs in scheduled_running_reqs
304305 scheduled_loras: set[int] = set()
@@ -382,7 +383,7 @@ def schedule(self) -> SchedulerOutput:
382383 num_computed_tokens = request.num_computed_tokens
383384
384385 encoder_inputs_to_schedule = None
385- new_encoder_budget = encoder_budget
386+ new_encoder_compute_budget = encoder_compute_budget
386387
387388 # KVTransfer: loading remote KV, do not allocate for new work.
388389 if load_kv_async:
@@ -413,10 +414,10 @@ def schedule(self) -> SchedulerOutput:
413414 # Schedule encoder inputs.
414415 if request.has_encoder_inputs:
415416 (encoder_inputs_to_schedule, num_new_tokens,
416- new_encoder_budget
417+ new_encoder_compute_budget
417418 ) = self._try_schedule_encoder_inputs(
418419 request, num_computed_tokens, num_new_tokens,
419- encoder_budget)
420+ encoder_compute_budget)
420421 if num_new_tokens == 0:
421422 # The request cannot be scheduled.
422423 break
@@ -495,7 +496,7 @@ def schedule(self) -> SchedulerOutput:
495496 # Allocate the encoder cache.
496497 for i in encoder_inputs_to_schedule:
497498 self.encoder_cache_manager.allocate(request, i)
498- encoder_budget = new_encoder_budget
499+ encoder_compute_budget = new_encoder_compute_budget
499500
500501 # Put back any skipped requests at the head of the waiting queue
501502 if skipped_waiting_requests:
@@ -658,7 +659,7 @@ def _try_schedule_encoder_inputs(
658659 request: Request,
659660 num_computed_tokens: int,
660661 num_new_tokens: int,
661- encoder_budget: int,
662+ encoder_compute_budget: int,
662663 ) -> tuple[list[int], int, int]:
663664 """
664665 Determine which encoder inputs need to be scheduled in the current step,
@@ -680,11 +681,17 @@ def _try_schedule_encoder_inputs(
680681 blocks and externally cached blocks (via KVConnector).
681682 """
682683 if num_new_tokens == 0 or not request.has_encoder_inputs:
683- return [], num_new_tokens, encoder_budget
684+ return [], num_new_tokens, encoder_compute_budget
684685 encoder_inputs_to_schedule: list[int] = []
685686 mm_positions = request.mm_positions
686687 assert mm_positions is not None
687688 assert len(mm_positions) > 0
689+
690+ # NOTE: since scheduler operates on the request level (possibly with
691+ # multiple encoder inputs per request), we need to create temporary
692+ # trackers for accounting at the encoder input level.
693+ mm_hashes_to_schedule = set()
694+ num_tokens_to_schedule = 0
688695 for i, pos_info in enumerate(mm_positions):
689696 start_pos = pos_info.offset
690697 num_encoder_tokens = pos_info.length
@@ -695,13 +702,20 @@ def _try_schedule_encoder_inputs(
695702 if start_pos >= num_computed_tokens + num_new_tokens:
696703 # The encoder input is not needed in this step.
697704 break
705+
698706 if start_pos + num_encoder_tokens <= num_computed_tokens:
699707 # The encoder input is already computed and stored
700708 # in the decoder's KV cache.
701709 continue
702710
711+ # The same encoder input has already been scheduled in the current
712+ # step.
713+ if request.mm_hashes[i] in mm_hashes_to_schedule:
714+ continue
715+
703716 if self.encoder_cache_manager.check_and_update_cache(request, i):
704- # The encoder input is already computed and cached.
717+ # The encoder input is already computed and cached from a
718+ # previous step.
705719 continue
706720
707721 # If no encoder input chunking is allowed, we do not want to
@@ -714,8 +728,9 @@ def _try_schedule_encoder_inputs(
714728 num_new_tokens = start_pos - num_computed_tokens
715729 break
716730
717- if not self.encoder_cache_manager.try_allocate(
718- request, i, encoder_budget):
731+ if not self.encoder_cache_manager.can_allocate(
732+ request, i, encoder_compute_budget,
733+ num_tokens_to_schedule):
719734 # The encoder cache is full or the encoder budget is exhausted.
720735 # NOTE(woosuk): We assume that the encoder input tokens should
721736 # be processed altogether, as the encoder usually uses
@@ -732,9 +747,16 @@ def _try_schedule_encoder_inputs(
732747 num_new_tokens = 0
733748 break
734749
735- encoder_budget -= num_encoder_tokens
750+ num_tokens_to_schedule += num_encoder_tokens
751+ encoder_compute_budget -= num_encoder_tokens
752+ mm_hashes_to_schedule.add(request.mm_hashes[i])
736753 encoder_inputs_to_schedule.append(i)
737- return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
754+
755+ return (
756+ encoder_inputs_to_schedule,
757+ num_new_tokens,
758+ encoder_compute_budget,
759+ )
738760
739761 def get_grammar_bitmask(
740762 self,
0 commit comments