From 3d4ceb292c1c03863daf2316b8123b17307050c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?=
Date: Tue, 21 Nov 2023 08:06:49 +0800
Subject: [PATCH] Fix hanging in the scheduler caused by long prompts (#1534)

---
 vllm/core/block_manager.py | 27 ++++++++++++++++++++++++---
 vllm/core/scheduler.py     | 14 ++++++++++++--
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 57349e7fe7f92..4ce87f6e5061b 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -1,4 +1,5 @@
 """A block manager that manages token blocks."""
+import enum
 from typing import Dict, List, Optional, Set, Tuple
 
 from vllm.block import PhysicalTokenBlock
@@ -54,6 +55,20 @@ def get_num_free_blocks(self) -> int:
 BlockTable = List[PhysicalTokenBlock]
 
 
+class AllocStatus(enum.Enum):
+    """Result for BlockSpaceManager.can_allocate.
+
+    1. Ok: the seq_group can be allocated now.
+    2. Later: the seq_group cannot be allocated now, but the allocator's
+       total capacity is larger than what the seq_group requires.
+    3. Never: the seq_group can never be allocated; it is too large to
+       fit in the GPU.
+    """
+    OK = enum.auto()
+    LATER = enum.auto()
+    NEVER = enum.auto()
+
+
 class BlockSpaceManager:
     """Manages the mapping between logical and physical token blocks."""
 
@@ -86,7 +101,7 @@ def __init__(
         # Mapping: seq_id -> BlockTable.
         self.block_tables: Dict[int, BlockTable] = {}
 
-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
         seq = seq_group.get_seqs()[0]
@@ -95,9 +110,15 @@ def can_allocate(self, seq_group: SequenceGroup) -> bool:
             num_required_blocks = min(num_required_blocks,
                                       self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+
         # Use watermark to avoid frequent cache eviction.
-        return (num_free_gpu_blocks - num_required_blocks >=
-                self.watermark_blocks)
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
 
     def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index b6cb91766eae0..60a4a90bb3eb2 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -3,7 +3,7 @@
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
@@ -154,8 +154,18 @@ def _schedule(self) -> SchedulerOutputs:
                     continue
 
                 # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
+                can_allocate = self.block_manager.can_allocate(seq_group)
+                if can_allocate == AllocStatus.LATER:
                     break
+                elif can_allocate == AllocStatus.NEVER:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds the capacity of block_manager")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    continue
 
                 # If the number of batched tokens exceeds the limit, stop.
                 new_seq_lens = seq_lens + [num_prompt_tokens]
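
For reference, the decision logic this patch introduces can be exercised in isolation. The sketch below is a minimal, self-contained illustration rather than vLLM's actual classes: ToyBlockManager, schedule_waiting, and their parameters are assumed names invented for the example; only the AllocStatus enum, the two watermark comparisons, and the LATER/NEVER handling mirror what the patch does.

# Minimal sketch (assumed names, not vLLM's real API) of the allocation
# decision and the scheduler's handling of AllocStatus.NEVER.
import enum
from typing import List


class AllocStatus(enum.Enum):
    OK = enum.auto()     # enough free blocks right now
    LATER = enum.auto()  # fits in total capacity, just not right now
    NEVER = enum.auto()  # can never fit, even with an empty cache


class ToyBlockManager:
    """Illustrative stand-in for BlockSpaceManager.can_allocate."""

    def __init__(self, total_blocks: int, free_blocks: int,
                 watermark_blocks: int) -> None:
        self.total_blocks = total_blocks
        self.free_blocks = free_blocks
        self.watermark_blocks = watermark_blocks

    def can_allocate(self, required_blocks: int) -> AllocStatus:
        # NEVER: even a fully empty cache could not hold the prompt while
        # keeping the watermark headroom, so waiting is pointless.
        if self.total_blocks - required_blocks < self.watermark_blocks:
            return AllocStatus.NEVER
        # OK: there is room now; otherwise, try again once blocks free up.
        if self.free_blocks - required_blocks >= self.watermark_blocks:
            return AllocStatus.OK
        return AllocStatus.LATER


def schedule_waiting(manager: ToyBlockManager,
                     waiting: List[int]) -> List[int]:
    """Drain a toy waiting queue of prompt sizes (in blocks).

    Mirrors the new scheduler behavior: LATER stops prompt scheduling for
    this step, while NEVER drops the request instead of leaving it at the
    head of the queue forever (the hang this patch fixes).
    """
    ignored = []
    while waiting:
        required = waiting[0]
        status = manager.can_allocate(required)
        if status == AllocStatus.LATER:
            break
        if status == AllocStatus.NEVER:
            print(f"prompt needing {required} blocks is too long; ignored")
            ignored.append(waiting.pop(0))
            continue
        # AllocStatus.OK: pretend to allocate and run the request.
        manager.free_blocks -= required
        waiting.pop(0)
    return ignored


if __name__ == "__main__":
    mgr = ToyBlockManager(total_blocks=100, free_blocks=10, watermark_blocks=1)
    # The 200-block prompt can never fit; before this patch such a request
    # would block the waiting queue indefinitely instead of being ignored.
    print(schedule_waiting(mgr, [5, 200, 3]))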