From 3d4ceb292c1c03863daf2316b8123b17307050c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=BA=8F?=
Date: Tue, 21 Nov 2023 08:06:49 +0800
Subject: [PATCH] Fix hanging in the scheduler caused by long prompts (#1534)

---
 vllm/core/block_manager.py | 27 ++++++++++++++++++++++++---
 vllm/core/scheduler.py     | 14 ++++++++++++--
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index 57349e7fe7f92..4ce87f6e5061b 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -1,4 +1,5 @@
 """A block manager that manages token blocks."""
+import enum
 from typing import Dict, List, Optional, Set, Tuple
 
 from vllm.block import PhysicalTokenBlock
@@ -54,6 +55,20 @@ def get_num_free_blocks(self) -> int:
 BlockTable = List[PhysicalTokenBlock]
 
 
+class AllocStatus(enum.Enum):
+    """Result for BlockSpaceManager.can_allocate.
+
+    1. Ok: the seq_group can be allocated now.
+    2. Later: the seq_group cannot be allocated now, but the allocator's
+       total capacity is larger than what the seq_group requires.
+    3. Never: the seq_group can never be allocated; it is too large to
+       fit in the GPU.
+    """
+    OK = enum.auto()
+    LATER = enum.auto()
+    NEVER = enum.auto()
+
+
 class BlockSpaceManager:
     """Manages the mapping between logical and physical token blocks."""
 
@@ -86,7 +101,7 @@ def __init__(
         # Mapping: seq_id -> BlockTable.
         self.block_tables: Dict[int, BlockTable] = {}
 
-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
         seq = seq_group.get_seqs()[0]
@@ -95,9 +110,15 @@ def can_allocate(self, seq_group: SequenceGroup) -> bool:
             num_required_blocks = min(num_required_blocks,
                                       self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+
         # Use watermark to avoid frequent cache eviction.
-        return (num_free_gpu_blocks - num_required_blocks >=
-                self.watermark_blocks)
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
 
     def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index b6cb91766eae0..60a4a90bb3eb2 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -3,7 +3,7 @@
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
@@ -154,8 +154,18 @@ def _schedule(self) -> SchedulerOutputs:
                     continue
 
                 # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
+                can_allocate = self.block_manager.can_allocate(seq_group)
+                if can_allocate == AllocStatus.LATER:
                     break
+                elif can_allocate == AllocStatus.NEVER:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds the capacity of block_manager")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    continue
 
                 # If the number of batched tokens exceeds the limit, stop.
                 new_seq_lens = seq_lens + [num_prompt_tokens]
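
For reference, the decision logic this patch introduces can be exercised in isolation. The sketch below is a minimal, self-contained illustration rather than vLLM's actual classes: ToyBlockManager, schedule_waiting, and their parameters are assumed names invented for the example; only the AllocStatus enum, the two watermark comparisons, and the LATER/NEVER handling mirror what the patch does.

# Minimal sketch (assumed names, not vLLM's real API) of the allocation
# decision and the scheduler's handling of AllocStatus.NEVER.
import enum
from typing import List


class AllocStatus(enum.Enum):
    OK = enum.auto()     # enough free blocks right now
    LATER = enum.auto()  # fits in total capacity, just not right now
    NEVER = enum.auto()  # can never fit, even with an empty cache


class ToyBlockManager:
    """Illustrative stand-in for BlockSpaceManager.can_allocate."""

    def __init__(self, total_blocks: int, free_blocks: int,
                 watermark_blocks: int) -> None:
        self.total_blocks = total_blocks
        self.free_blocks = free_blocks
        self.watermark_blocks = watermark_blocks

    def can_allocate(self, required_blocks: int) -> AllocStatus:
        # NEVER: even a fully empty cache could not hold the prompt while
        # keeping the watermark headroom, so waiting is pointless.
        if self.total_blocks - required_blocks < self.watermark_blocks:
            return AllocStatus.NEVER
        # OK: there is room now; otherwise, try again once blocks free up.
        if self.free_blocks - required_blocks >= self.watermark_blocks:
            return AllocStatus.OK
        return AllocStatus.LATER


def schedule_waiting(manager: ToyBlockManager,
                     waiting: List[int]) -> List[int]:
    """Drain a toy waiting queue of prompt sizes (in blocks).

    Mirrors the new scheduler behavior: LATER stops prompt scheduling for
    this step, while NEVER drops the request instead of leaving it at the
    head of the queue forever (the hang this patch fixes).
    """
    ignored = []
    while waiting:
        required = waiting[0]
        status = manager.can_allocate(required)
        if status == AllocStatus.LATER:
            break
        if status == AllocStatus.NEVER:
            print(f"prompt needing {required} blocks is too long; ignored")
            ignored.append(waiting.pop(0))
            continue
        # AllocStatus.OK: pretend to allocate and run the request.
        manager.free_blocks -= required
        waiting.pop(0)
    return ignored


if __name__ == "__main__":
    mgr = ToyBlockManager(total_blocks=100, free_blocks=10, watermark_blocks=1)
    # The 200-block prompt can never fit; before this patch such a request
    # would block the waiting queue indefinitely instead of being ignored.
    print(schedule_waiting(mgr, [5, 200, 3]))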