
Commit

[PP] Correct cache size check (vllm-project#13873)
Signed-off-by: Yang Zheng <zhengy.gator@gmail.com>
Signed-off-by: Linkun Chen <github@lkchen.net>
zhengy001 authored and lk-chen committed Mar 5, 2025
1 parent 5dec91a commit ebf425d
Showing 2 changed files with 14 additions and 12 deletions.
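
The change threads parallel_config.pipeline_parallel_size into raise_if_cache_size_invalid so that the KV-cache capacity check measures the blocks available per pipeline stage instead of the total profiled block count. A minimal standalone sketch of the corrected check, with abbreviated error messages and hypothetical numbers (not the exact vLLM code):

def raise_if_cache_size_invalid(num_gpu_blocks: int, block_size: int,
                                max_model_len: int,
                                pipeline_parallel_size: int) -> None:
    # Sketch only: mirrors the corrected logic; messages abbreviated.
    if num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization`.")
    # Capacity is now counted per pipeline stage: the total block count is
    # divided by the number of pipeline stages before the comparison.
    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
    if max_model_len > max_seq_len:
        raise ValueError(f"The model's max seq len ({max_model_len}) "
                         f"exceeds the KV cache capacity ({max_seq_len}).")

# Hypothetical numbers: 2048 blocks of 16 tokens with pipeline_parallel_size=2
# give 1024 * 16 = 16384 usable tokens; the old check would have reported 32768.
raise_if_cache_size_invalid(num_gpu_blocks=2048, block_size=16,
                            max_model_len=16384, pipeline_parallel_size=2)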
13 changes: 7 additions & 6 deletions vllm/worker/hpu_worker.py
@@ -258,9 +258,10 @@ def initialize_cache(self, num_gpu_blocks: int,
         This also warms up the model, which may record CUDA graphs.
         """
-        raise_if_cache_size_invalid(num_gpu_blocks,
-                                    self.cache_config.block_size,
-                                    self.model_config.max_model_len)
+        raise_if_cache_size_invalid(
+            num_gpu_blocks, self.cache_config.block_size,
+            self.model_config.max_model_len,
+            self.parallel_config.pipeline_parallel_size)
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -442,13 +443,13 @@ def init_worker_distributed_environment(
                                       parallel_config.pipeline_parallel_size)
 
 
-def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
-                                max_model_len) -> None:
+def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len,
+                                pipeline_parallel_size) -> None:
     if num_gpu_blocks <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
                          "initializing the engine.")
-    max_seq_len = block_size * num_gpu_blocks
+    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
     if max_model_len > max_seq_len:
         raise ValueError(
             f"The model's max seq len ({max_model_len}) "
13 changes: 7 additions & 6 deletions vllm/worker/worker.py
@@ -288,10 +288,11 @@ def initialize_cache(self, num_gpu_blocks: int,
         This also warms up the model, which may record CUDA graphs.
         """
-        raise_if_cache_size_invalid(num_gpu_blocks,
-                                    self.cache_config.block_size,
-                                    self.cache_config.is_attention_free,
-                                    self.model_config.max_model_len)
+        raise_if_cache_size_invalid(
+            num_gpu_blocks, self.cache_config.block_size,
+            self.cache_config.is_attention_free,
+            self.model_config.max_model_len,
+            self.parallel_config.pipeline_parallel_size)
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -530,7 +531,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
 
 
 def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
-                                max_model_len) -> None:
+                                max_model_len, pipeline_parallel_size) -> None:
     if is_attention_free and num_gpu_blocks != 0:
         raise ValueError("No memory should be allocated for the cache blocks "
                          f"for an attention-free model, but {num_gpu_blocks} "
@@ -539,7 +540,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
                          "initializing the engine.")
-    max_seq_len = block_size * num_gpu_blocks
+    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
     if not is_attention_free and max_model_len > max_seq_len:
         raise ValueError(
             f"The model's max seq len ({max_model_len}) "
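
The worker.py variant also carries is_attention_free: attention-free models must allocate zero cache blocks and skip the length comparison entirely. A hedged illustration of how the two branches interact, using a standalone helper that mirrors the hunk above (hypothetical numbers, not imported from vLLM):

def check_cache_size(num_gpu_blocks, block_size, is_attention_free,
                     max_model_len, pipeline_parallel_size):
    # Standalone mirror of the corrected worker.py check (sketch only).
    if is_attention_free and num_gpu_blocks != 0:
        raise ValueError("Attention-free models must allocate zero blocks.")
    if not is_attention_free and num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks.")
    max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size)
    if not is_attention_free and max_model_len > max_seq_len:
        raise ValueError(f"max seq len {max_model_len} exceeds per-stage "
                         f"KV-cache capacity {max_seq_len}")

# Attention-free model: zero blocks are expected and the length check is skipped.
check_cache_size(num_gpu_blocks=0, block_size=16, is_attention_free=True,
                 max_model_len=131072, pipeline_parallel_size=4)

# With pipeline_parallel_size=4, 8192 blocks of 16 tokens hold 32768 tokens per
# stage, so a 40960-token max_model_len is now rejected; the old comparison
# (8192 * 16 = 131072) accepted it.
try:
    check_cache_size(num_gpu_blocks=8192, block_size=16,
                     is_attention_free=False, max_model_len=40960,
                     pipeline_parallel_size=4)
except ValueError as exc:
    print(exc)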
