@@ -373,18 +373,22 @@ def _try_schedule_encoder_inputs(
             if self.encoder_cache_manager.has_cache(request, i):
                 # The encoder input is already computed and cached.
                 continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
-                # The encoder cache is full. We can only schedule the decoder
-                # tokens just before the encoder input.
-                num_new_tokens = start_pos - num_computed_tokens
-                break
-            if num_encoder_tokens > encoder_budget:
-                # The encoder budget is exhausted. We can only schedule the
-                # decoder tokens up until the encoder input.
-                # NOTE(woosuk): We assume that the encoder tokens should be
-                # processed altogether, as the encoder usually uses
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
                 # bidirectional attention.
-                num_new_tokens = start_pos - num_computed_tokens
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
                 break
 
             encoder_budget -= num_encoder_tokens
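
As a minimal, standalone sketch (not part of the commit), the new fall-through can be read as a clamp on num_new_tokens; the helper name and the concrete numbers below are hypothetical:

def tokens_schedulable_before_encoder_input(start_pos: int,
                                            num_computed_tokens: int) -> int:
    # Mirrors the diff's branch: schedule only the decoder tokens that
    # precede the encoder input, or nothing at all if prefix caching has
    # already advanced past an encoder input whose output is not cached.
    if num_computed_tokens < start_pos:
        return start_pos - num_computed_tokens
    return 0

# Example: the encoder input starts at token position 100.
assert tokens_schedulable_before_encoder_input(100, 80) == 20   # 20 decoder tokens
assert tokens_schedulable_before_encoder_input(100, 120) == 0   # skip this step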