
Commit 1c68e1c

ywang96 (Roger Wang) and knlnguyen1802 authored and committed
[Bugfix] Fix scheduling when repeated images in one request (vllm-project#23544)
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Signed-off-by: Xiao Yu <xiao.yu@amd.com>
1 parent 4892eb6 commit 1c68e1c
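In short, as the diff below shows: the old try_allocate() combined the capacity check with the cache-counter update, so an image appearing more than once in a single request could be scheduled and budgeted more than once within one scheduling step. The fix renames the check to can_allocate(), which now also accounts for tokens already earmarked in the current step via a new num_tokens_to_schedule parameter; moves the counter updates into allocate(); and has the scheduler track mm_hashes_to_schedule so a repeated image is scheduled at most once per step.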

3 files changed: 96 additions, 39 deletions


tests/v1/core/test_encoder_cache_manager.py

Lines changed: 38 additions & 11 deletions
@@ -22,7 +22,7 @@ def test_basic_allocate_and_reuse():
     req = MockRequest("r1", ["imgA"], [4])
 
     assert not cache.check_and_update_cache(req, 0)
-    assert cache.try_allocate(req, 0, int(1e9))
+    assert cache.can_allocate(req, 0, int(1e9), 0)
 
     cache.allocate(req, 0)
 
@@ -44,7 +44,7 @@ def test_freeing_decreases_refcount_and_moves_to_freeable():
     manager = EncoderCacheManager(cache_size=10)
     req = MockRequest("req2", ["img3"], [5])
 
-    assert manager.try_allocate(req, 0, int(1e9))
+    assert manager.can_allocate(req, 0, int(1e9), 0)
     manager.allocate(req, 0)
 
     assert len(manager.cached["img3"]) == 1
@@ -60,10 +60,10 @@ def test_free_request_frees_all_inputs():
     manager = EncoderCacheManager(cache_size=10)
     req = MockRequest("req3", ["a", "b"], [2, 3])
 
-    assert manager.try_allocate(req, 0, int(1e9))
+    assert manager.can_allocate(req, 0, int(1e9), 0)
     manager.allocate(req, 0)
 
-    assert manager.try_allocate(req, 1, int(1e9))
+    assert manager.can_allocate(req, 1, int(1e9), 0)
     manager.allocate(req, 1)
 
     assert len(manager.cached["a"]) == 1
@@ -84,11 +84,11 @@ def test_eviction_when_cache_is_full():
     req1 = MockRequest("req1", ["x"], [6])
     req2 = MockRequest("req2", ["y"], [5])
 
-    assert manager.try_allocate(req1, 0, int(1e9))
+    assert manager.can_allocate(req1, 0, int(1e9), 0)
     manager.allocate(req1, 0)
     manager.free_encoder_input(req1, 0)
 
-    assert manager.try_allocate(req2, 0, int(1e9))
+    assert manager.can_allocate(req2, 0, int(1e9), 0)
     manager.allocate(req2, 0)
 
     # 'x' should have been evicted.
@@ -100,10 +100,10 @@ def test_get_cached_input_ids():
     manager = EncoderCacheManager(cache_size=10)
     req = MockRequest("reqX", ["m", "n", "o"], [2, 4, 3])
 
-    assert manager.try_allocate(req, 0, int(1e9))
+    assert manager.can_allocate(req, 0, int(1e9), 0)
     manager.allocate(req, 0)
 
-    assert manager.try_allocate(req, 2, int(1e9))
+    assert manager.can_allocate(req, 2, int(1e9), 0)
     manager.allocate(req, 2)
 
     cached_ids = manager.get_cached_input_ids(req)
@@ -114,7 +114,7 @@ def test_has_cache_restores_from_freeable():
     manager = EncoderCacheManager(cache_size=10)
    req = MockRequest("reqY", ["imgZ"], [4])
 
-    assert manager.try_allocate(req, 0, int(1e9))
+    assert manager.can_allocate(req, 0, int(1e9), 0)
     manager.allocate(req, 0)
 
     manager.free_encoder_input(req, 0)
@@ -131,14 +131,41 @@ def test_get_freed_mm_hashes_clears_freed_list():
     req1 = MockRequest("reqA", ["a"], [5])
     req2 = MockRequest("reqB", ["b"], [6])
 
-    assert manager.try_allocate(req1, 0, int(1e9))
+    assert manager.can_allocate(req1, 0, int(1e9), 0)
     manager.allocate(req1, 0)
     manager.free_encoder_input(req1, 0)
 
     # Should trigger eviction of 'a'.
-    assert manager.try_allocate(req2, 0, int(1e9))
+    assert manager.can_allocate(req2, 0, int(1e9), 0)
     manager.allocate(req2, 0)
 
     freed = manager.get_freed_mm_hashes()
     assert "a" in freed
     assert manager.get_freed_mm_hashes() == []
+
+
+def test_schedule_request_multi_images_respect_space_limit():
+    manager = EncoderCacheManager(cache_size=10)
+    req = MockRequest("reqA", ["a", "b"], [5, 6])
+    compute_budget = 100
+
+    num_tokens_to_schedule = 0
+    assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule)
+    num_tokens_to_schedule += req.get_num_encoder_tokens(0)
+    compute_budget -= req.get_num_encoder_tokens(0)
+
+    assert not manager.can_allocate(req, 1, compute_budget,
+                                    num_tokens_to_schedule)
+
+
+def test_schedule_request_multi_images_respect_compute_limit():
+    manager = EncoderCacheManager(cache_size=100)
+    req = MockRequest("reqA", ["a", "b"], [5, 6])
+    compute_budget = 10
+    num_tokens_to_schedule = 0
+    assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule)
+    num_tokens_to_schedule += req.get_num_encoder_tokens(0)
+    compute_budget -= req.get_num_encoder_tokens(0)
+
+    assert not manager.can_allocate(req, 1, compute_budget,
+                                    num_tokens_to_schedule)
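The two new tests pin down the two failure modes separately. For concreteness, the compute-limit case reduces to the arithmetic below; this is a standalone sketch with plain ints, not code from the repo (the space-limit case is worked through under can_allocate() further down):

# test_schedule_request_multi_images_respect_compute_limit, by the numbers:
# a 10-token compute budget admits the first 5-token image, leaving 5,
# which is not enough for the second 6-token image.
compute_budget = 10
sizes = [5, 6]
assert sizes[0] <= compute_budget        # image 0: 5 <= 10, schedulable
compute_budget -= sizes[0]               # 5 tokens of compute remain
assert not sizes[1] <= compute_budget    # image 1: 6 > 5, must wait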

vllm/v1/core/encoder_cache_manager.py

Lines changed: 20 additions & 12 deletions
@@ -99,8 +99,9 @@ def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         self.cached[mm_hash].add(request.request_id)
         return True
 
-    def try_allocate(self, request: Request, input_id: int,
-                     encoder_budget: int) -> bool:
+    def can_allocate(self, request: Request, input_id: int,
+                     encoder_compute_budget: int,
+                     num_tokens_to_schedule: int) -> bool:
         """Check if there's sufficient cache space for a multimodal input.
         If there is, return True and update EncoderCacheManager state.
@@ -116,6 +117,10 @@ def try_allocate(self, request: Request, input_id: int,
         Args:
             request: The request containing the multimodal input.
             input_id: Index of the multimodal input within the request.
+            encoder_compute_budget: Number of encoder tokens allowed to be
+                computed when this method is invoked.
+            num_tokens_to_schedule: Number of tokens already scheduled to be
+                allocated with cache space when this method is invoked.
 
         Returns:
             True if there's enough capacity to hold the encoder output for this
@@ -128,13 +133,13 @@ def try_allocate(self, request: Request, input_id: int,
         num_tokens = request.get_num_encoder_tokens(input_id)
 
         # Not enough compute budget
-        if num_tokens > encoder_budget:
+        if num_tokens > encoder_compute_budget:
             return False
 
+        num_tokens += num_tokens_to_schedule
+
         # Enough free slots
         if num_tokens <= self.num_free_slots:
-            self.num_free_slots -= num_tokens
-            self.num_freeable_slots -= num_tokens
             return True
 
         # Not enough reclaimable slots
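The added num_tokens += num_tokens_to_schedule line is the core of the space fix: can_allocate() no longer debits the slot counters itself, so tokens earmarked for earlier inputs of the same scheduling step must be folded into the comparison against num_free_slots. The space-limit test's numbers, as a standalone sketch (plain ints, not repo code):

# cache_size=10; a 5-token image is already earmarked this step, so the
# 6-token image must compare 6 + 5 = 11 against the 10 free slots.
num_free_slots = 10
num_tokens_to_schedule = 5    # earmarked earlier in the step, not yet committed
num_tokens = 6                # the input being checked now
num_tokens += num_tokens_to_schedule
assert not num_tokens <= num_free_slots   # 11 > 10 -> can_allocate returns False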
@@ -149,8 +154,6 @@ def try_allocate(self, request: Request, input_id: int,
             del self.cached[mm_hash]
             self.freed.append(mm_hash)
             self.num_free_slots += num_free_token
-        self.num_free_slots -= num_tokens
-        self.num_freeable_slots -= num_tokens
         return True
 
     def allocate(self, request: Request, input_id: int) -> None:
@@ -161,19 +164,24 @@ def allocate(self, request: Request, input_id: int) -> None:
         the model runner; this method updates the manager's bookkeeping.
 
         Note:
-            This method assumes try_allocate() returned True for the same input.
+            This method assumes can_allocate() returned True for the same input.
         """
-        # Encoder cache space budget should be already updated for the
-        # multimodal input and non-negative after try_allocate() is called.
-        assert self.num_free_slots >= 0
-        assert self.num_freeable_slots >= 0
 
         mm_hash = request.mm_hashes[input_id]
         request_id = request.request_id
         if mm_hash not in self.cached:
             self.cached[mm_hash] = set()
 
+        num_encoder_tokens = request.get_num_encoder_tokens(input_id)
+
+        # NOTE: Encoder cache should always have enough space for encoder inputs
+        # that are scheduled since eviction takes place at can_allocate().
+        assert self.num_free_slots >= num_encoder_tokens
+        assert self.num_freeable_slots >= num_encoder_tokens
+
         self.cached[mm_hash].add(request_id)
+        self.num_free_slots -= num_encoder_tokens
+        self.num_freeable_slots -= num_encoder_tokens
 
     def get_cached_input_ids(self, request: Request) -> set[int]:
         """Get all cached multimodal input IDs for a request.
vllm/v1/core/sched/scheduler.py

Lines changed: 38 additions & 16 deletions
@@ -182,7 +182,7 @@ def schedule(self) -> SchedulerOutput:
         token_budget = self.max_num_scheduled_tokens
         # Encoder-related.
         scheduled_encoder_inputs: dict[str, list[int]] = {}
-        encoder_budget = self.max_num_encoder_input_tokens
+        encoder_compute_budget = self.max_num_encoder_input_tokens
         # Spec decode-related.
         scheduled_spec_decode_tokens: dict[str, list[int]] = {}
 
@@ -211,12 +211,13 @@ def schedule(self) -> SchedulerOutput:
 
             # Schedule encoder inputs.
             encoder_inputs_to_schedule = None
-            new_encoder_budget = encoder_budget
+            new_encoder_compute_budget = encoder_compute_budget
             if request.has_encoder_inputs:
                 (encoder_inputs_to_schedule, num_new_tokens,
-                 new_encoder_budget) = self._try_schedule_encoder_inputs(
+                 new_encoder_compute_budget
+                 ) = self._try_schedule_encoder_inputs(
                     request, request.num_computed_tokens, num_new_tokens,
-                    encoder_budget)
+                    encoder_compute_budget)
 
             if num_new_tokens == 0:
                 # The request cannot be scheduled because one of the following
@@ -298,7 +299,7 @@ def schedule(self) -> SchedulerOutput:
                 # Allocate the encoder cache.
                 for i in encoder_inputs_to_schedule:
                     self.encoder_cache_manager.allocate(request, i)
-                encoder_budget = new_encoder_budget
+                encoder_compute_budget = new_encoder_compute_budget
 
         # Record the LoRAs in scheduled_running_reqs
         scheduled_loras: set[int] = set()
@@ -382,7 +383,7 @@ def schedule(self) -> SchedulerOutput:
                 num_computed_tokens = request.num_computed_tokens
 
                 encoder_inputs_to_schedule = None
-                new_encoder_budget = encoder_budget
+                new_encoder_compute_budget = encoder_compute_budget
 
                 # KVTransfer: loading remote KV, do not allocate for new work.
                 if load_kv_async:
@@ -413,10 +414,10 @@ def schedule(self) -> SchedulerOutput:
                 # Schedule encoder inputs.
                 if request.has_encoder_inputs:
                     (encoder_inputs_to_schedule, num_new_tokens,
-                     new_encoder_budget
+                     new_encoder_compute_budget
                      ) = self._try_schedule_encoder_inputs(
                          request, num_computed_tokens, num_new_tokens,
-                         encoder_budget)
+                         encoder_compute_budget)
                     if num_new_tokens == 0:
                         # The request cannot be scheduled.
                         break
@@ -495,7 +496,7 @@ def schedule(self) -> SchedulerOutput:
                 # Allocate the encoder cache.
                 for i in encoder_inputs_to_schedule:
                     self.encoder_cache_manager.allocate(request, i)
-                encoder_budget = new_encoder_budget
+                encoder_compute_budget = new_encoder_compute_budget
 
         # Put back any skipped requests at the head of the waiting queue
        if skipped_waiting_requests:
@@ -658,7 +659,7 @@ def _try_schedule_encoder_inputs(
         request: Request,
         num_computed_tokens: int,
         num_new_tokens: int,
-        encoder_budget: int,
+        encoder_compute_budget: int,
     ) -> tuple[list[int], int, int]:
         """
         Determine which encoder inputs need to be scheduled in the current step,
@@ -680,11 +681,17 @@ def _try_schedule_encoder_inputs(
             blocks and externally cached blocks (via KVConnector).
         """
         if num_new_tokens == 0 or not request.has_encoder_inputs:
-            return [], num_new_tokens, encoder_budget
+            return [], num_new_tokens, encoder_compute_budget
         encoder_inputs_to_schedule: list[int] = []
         mm_positions = request.mm_positions
         assert mm_positions is not None
         assert len(mm_positions) > 0
+
+        # NOTE: since scheduler operates on the request level (possibly with
+        # multiple encoder inputs per request), we need to create temporary
+        # trackers for accounting at the encoder input level.
+        mm_hashes_to_schedule = set()
+        num_tokens_to_schedule = 0
         for i, pos_info in enumerate(mm_positions):
             start_pos = pos_info.offset
             num_encoder_tokens = pos_info.length
@@ -695,13 +702,20 @@ def _try_schedule_encoder_inputs(
             if start_pos >= num_computed_tokens + num_new_tokens:
                 # The encoder input is not needed in this step.
                 break
+
             if start_pos + num_encoder_tokens <= num_computed_tokens:
                 # The encoder input is already computed and stored
                 # in the decoder's KV cache.
                 continue
 
+            # The same encoder input has already been scheduled in the current
+            # step.
+            if request.mm_hashes[i] in mm_hashes_to_schedule:
+                continue
+
             if self.encoder_cache_manager.check_and_update_cache(request, i):
-                # The encoder input is already computed and cached.
+                # The encoder input is already computed and cached from a
+                # previous step.
                 continue
 
             # If no encoder input chunking is allowed, we do not want to
@@ -714,8 +728,9 @@ def _try_schedule_encoder_inputs(
                 num_new_tokens = start_pos - num_computed_tokens
                 break
 
-            if not self.encoder_cache_manager.try_allocate(
-                    request, i, encoder_budget):
+            if not self.encoder_cache_manager.can_allocate(
+                    request, i, encoder_compute_budget,
+                    num_tokens_to_schedule):
                 # The encoder cache is full or the encoder budget is exhausted.
                 # NOTE(woosuk): We assume that the encoder input tokens should
                 # be processed altogether, as the encoder usually uses
@@ -732,9 +747,16 @@ def _try_schedule_encoder_inputs(
                 num_new_tokens = 0
                 break
 
-            encoder_budget -= num_encoder_tokens
+            num_tokens_to_schedule += num_encoder_tokens
+            encoder_compute_budget -= num_encoder_tokens
+            mm_hashes_to_schedule.add(request.mm_hashes[i])
             encoder_inputs_to_schedule.append(i)
-        return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
+
+        return (
+            encoder_inputs_to_schedule,
+            num_new_tokens,
+            encoder_compute_budget,
+        )
 
     def get_grammar_bitmask(
         self,
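To see the three mechanisms interact (per-step dedup, compute budget, space earmarking), here is a self-contained distillation of the loop above; it uses plain data and mirrors the diff's variable names, but it is a sketch, not the scheduler itself:

# One request with a repeated 4-token image and a 6-token image.
mm_hashes = ["imgA", "imgA", "imgB"]
num_encoder_tokens = {"imgA": 4, "imgB": 6}
encoder_compute_budget = 8
num_free_slots = 16

mm_hashes_to_schedule: set[str] = set()
num_tokens_to_schedule = 0
encoder_inputs_to_schedule: list[int] = []
for i, mm_hash in enumerate(mm_hashes):
    if mm_hash in mm_hashes_to_schedule:
        continue                                  # repeat within this step
    n = num_encoder_tokens[mm_hash]
    if n > encoder_compute_budget:
        break                                     # compute budget exhausted
    if n + num_tokens_to_schedule > num_free_slots:
        break                                     # no cache space left
    num_tokens_to_schedule += n
    encoder_compute_budget -= n
    mm_hashes_to_schedule.add(mm_hash)
    encoder_inputs_to_schedule.append(i)

# imgA is scheduled once (4 tokens) and its repeat is skipped; imgB needs 6
# tokens but only 4 remain in the compute budget, so it waits for a later step.
assert encoder_inputs_to_schedule == [0]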
