
[Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers #19029


Merged · 2 commits · Jun 4, 2025
90 changes: 83 additions & 7 deletions tests/v1/core/test_kv_cache_utils.py
@@ -11,13 +11,11 @@
from vllm.v1.core.kv_cache_manager import KVCacheManager
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
unify_kv_cache_configs)
from vllm.v1.core.kv_cache_utils import (
FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
estimate_max_model_len, generate_block_hash_extra_keys,
get_max_concurrency_for_kv_cache_config, hash_block_tokens,
hash_request_tokens, unify_kv_cache_configs)
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheTensor,
SlidingWindowSpec)
@@ -598,6 +596,84 @@ def test_estimate_max_model_len(model_id, max_model_len,
assert estimated_max_len == want_estimated_max_len


def test_get_max_concurrency_for_kv_cache_config():
# Create a VllmConfig
model_id = "Qwen/Qwen1.5-7B"
max_model_len = 16384
model_config = ModelConfig(
model_id,
task="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
max_model_len=max_model_len,
)
scheduler_config = SchedulerConfig(max_num_batched_tokens=1024,
enable_chunked_prefill=True)

vllm_config = VllmConfig(
model_config=model_config,
scheduler_config=scheduler_config,
)

full_attention_spec = FullAttentionSpec(
block_size=16,
num_kv_heads=32,
head_size=128,
dtype=torch.float16,
use_mla=False,
)

sliding_window_spec = SlidingWindowSpec(
block_size=16,
num_kv_heads=32,
head_size=128,
dtype=torch.float16,
use_mla=False,
sliding_window=1024,
)

kv_cache_config_full_attention = KVCacheConfig(
num_blocks=int(1024 * 1.5),
tensors={},
kv_cache_groups=[
KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
full_attention_spec),
],
)
max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config_full_attention)
assert max_concurrency_full_attention == 1.5

kv_cache_config_sliding_window = KVCacheConfig(
num_blocks=129 * 3,
tensors={},
kv_cache_groups=[
KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
sliding_window_spec),
],
)
max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config_sliding_window)
assert max_concurrency_sliding_window == 3

kv_cache_config_hybrid_model = KVCacheConfig(
num_blocks=(1024 + 129) * 3,
tensors={},
kv_cache_groups=[
KVCacheGroupSpec([f"layer_{i}" for i in range(32)],
full_attention_spec),
KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)],
sliding_window_spec),
],
)
max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config_hybrid_model)
assert max_concurrency_hybrid_model == 3
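
The expected values above follow from each case's block pool divided by its per-request block demand. A minimal sketch of that arithmetic (not part of the test; the 129-block figure for the sliding-window case is inferred from the test's `num_blocks` choice, on the assumption that the spec budgets roughly `sliding_window + max_num_batched_tokens` tokens per request):

```python
from math import ceil

block_size = 16
max_model_len = 16384
sliding_window = 1024
max_num_batched_tokens = 1024

# Full attention: a request may need its whole context in cache.
full_attn_blocks = ceil(max_model_len / block_size)  # 1024
# Sliding window (assumed formula): window plus one scheduling step, rounded
# up to blocks, plus one partial block -> 129 with the parameters above.
swa_blocks = ceil((sliding_window - 1 + max_num_batched_tokens) / block_size) + 1

print(int(1024 * 1.5) / full_attn_blocks)                   # 1.5 (full attention)
print(129 * 3 / swa_blocks)                                 # 3.0 (sliding window)
print((1024 + 129) * 3 / (full_attn_blocks + swa_blocks))   # 3.0 (hybrid)
```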


def test_allocate_with_lookahead():
"""Verify that lookahead tokens correctly affect block allocation"""
block_size = 4
61 changes: 42 additions & 19 deletions vllm/v1/core/kv_cache_utils.py
@@ -2,13 +2,13 @@
"""KV-Cache Utilities."""
import os
from collections import deque
from collections.abc import Sequence
from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from typing import Any, Callable, NamedTuple, Optional

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import GiB_bytes, sha256
from vllm.utils import GiB_bytes, cdiv, sha256
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheSpec,
KVCacheTensor, SlidingWindowSpec)
@@ -464,6 +464,15 @@ def hash_request_tokens(hash_function: Any, block_size: int,
return ret


def max_memory_usage_bytes(vllm_config: VllmConfig,
kv_cache_specs: Iterable[KVCacheSpec]) -> int:
"""
Get the maximum memory usage in bytes for the given KV cache specs.
"""
return sum(
spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs)
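
For reference, a toy illustration of the helper's contract (the class below is a hypothetical stand-in, not a vLLM type): it simply sums each spec's worst-case per-request usage.

```python
from dataclasses import dataclass

@dataclass
class _ToySpec:
    # Hypothetical stand-in for a KVCacheSpec with a fixed worst-case size.
    worst_case_bytes: int

    def max_memory_usage_bytes(self, _vllm_config=None) -> int:
        return self.worst_case_bytes

specs = [_ToySpec(1 << 20), _ToySpec(2 << 20)]  # two layers: 1 MiB and 2 MiB
assert sum(s.max_memory_usage_bytes() for s in specs) == 3 << 20
```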


def estimate_max_model_len(vllm_config: VllmConfig,
kv_cache_spec: dict[str, KVCacheSpec],
available_memory: int) -> int:
@@ -485,11 +494,8 @@ def fits_in_memory(model_len: int) -> bool:
# Modify the max_model_len for this calculation
vllm_config.model_config.max_model_len = model_len
# Calculate memory needed for the given model length
memory_needed = sum(
(layer_spec.max_memory_usage_bytes(vllm_config)
for layer_spec in kv_cache_spec.values()),
start=0,
)
memory_needed = max_memory_usage_bytes(vllm_config,
kv_cache_spec.values())
return memory_needed <= available_memory

# Binary search for the maximum model length
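
The search itself lies outside this hunk; as context, a self-contained sketch of the underlying pattern (illustrative only, not the actual vLLM implementation): binary-search the largest length for which a monotone fits-in-memory predicate still holds.

```python
def largest_fitting_len(fits, lo: int, hi: int) -> int:
    """Return the largest value in [lo, hi] for which fits() is True,
    assuming fits is monotone (True up to some threshold, then False)."""
    best = 0
    while lo <= hi:
        mid = (lo + hi) // 2
        if fits(mid):
            best, lo = mid, mid + 1
        else:
            hi = mid - 1
    return best

# Made-up cost model: 1 KiB of KV cache per token, 4 GiB available.
assert largest_fitting_len(lambda n: n * 1024 <= 4 * 2**30, 1, 10_000_000) == 4 * 2**20
```
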
@@ -534,9 +540,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
"initializing the engine.")

max_model_len = vllm_config.model_config.max_model_len
needed_memory = 0
for layer_spec in kv_cache_spec.values():
needed_memory += layer_spec.max_memory_usage_bytes(vllm_config)
needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())

if needed_memory > available_memory:
# Estimate the maximum model length that can fit in the available memory
@@ -602,6 +606,24 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
return len(layer_keys) == 1


def get_max_concurrency_for_kv_cache_config(
vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float:
"""
Get the maximum concurrency for the given KV cache configuration.
"""
num_layer_per_group = max(
len(group.layer_names) for group in kv_cache_config.kv_cache_groups)
max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes(
vllm_config,
(group.kv_cache_spec for group in kv_cache_config.kv_cache_groups))
memory_per_block = kv_cache_config.kv_cache_groups[
0].kv_cache_spec.page_size_bytes * num_layer_per_group
num_block_per_request = cdiv(max_memory_usage_per_request,
memory_per_block)
max_concurrency = kv_cache_config.num_blocks / num_block_per_request
return max_concurrency
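
Restated with plain numbers: a request's worst case is the layer count of the largest group times the summed per-layer worst case of every group, converted to whole blocks; concurrency is the pool size divided by that. A sketch mirroring the formula above (the page-size arithmetic assumes 2 bytes per element and separate K and V, matching the float16 specs in the test):

```python
from math import ceil

def sketch_max_concurrency(num_pool_blocks: int, num_layer_per_group: int,
                           per_layer_bytes_per_group: list[int],
                           page_size_bytes: int) -> float:
    # per_layer_bytes_per_group[i]: worst-case bytes of ONE layer of group i
    # for a single request; page_size_bytes: page size of group 0's spec.
    per_request_bytes = num_layer_per_group * sum(per_layer_bytes_per_group)
    bytes_per_block = page_size_bytes * num_layer_per_group
    blocks_per_request = ceil(per_request_bytes / bytes_per_block)
    return num_pool_blocks / blocks_per_request

# Hybrid example from the test above: 32 layers per group, full attention needs
# 1024 pages per layer, sliding window needs 129, pool = (1024 + 129) * 3 blocks.
page = 2 * 16 * 32 * 128 * 2  # K/V * block_size * kv_heads * head_size * fp16 bytes
assert sketch_max_concurrency((1024 + 129) * 3, 32,
                              [1024 * page, 129 * page], page) == 3.0
```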


def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
kv_cache_spec: dict[str, KVCacheSpec],
available_memory: int) -> KVCacheConfig:
@@ -633,14 +655,6 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
"num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override)
num_blocks = num_gpu_blocks_override

num_tokens = num_blocks * vllm_config.cache_config.block_size
num_tokens_str = f"{num_tokens:,}"
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
max_concurrency = num_tokens / vllm_config.model_config.max_model_len
logger.info("Maximum concurrency for %s tokens per request: %.2fx",
max_model_len_str, max_concurrency)

per_layer_size = page_size * num_blocks
# All layers have the same KV cache spec, so we create one kv cache group
# for all layers.
@@ -655,6 +669,15 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
grouped_layer_names),
)

num_tokens = num_blocks * vllm_config.cache_config.block_size
num_tokens_str = f"{num_tokens:,}"
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
max_concurrency = get_max_concurrency_for_kv_cache_config(
vllm_config, kv_cache_config)
logger.info("Maximum concurrency for %s tokens per request: %.2fx",
max_model_len_str, max_concurrency)
return kv_cache_config
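
With the uniform full-attention configuration from the test above (1,536 blocks of 16 tokens, a 16,384-token context), the two log lines emitted here would read roughly as follows (an illustrative rendering of the format strings in the diff, not captured output):

```
GPU KV cache size: 24,576 tokens
Maximum concurrency for 16,384 tokens per request: 1.50x
```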


@@ -701,8 +724,8 @@ def get_kv_cache_config(vllm_config: VllmConfig,
Returns:
The generated KVCacheConfigs
"""
check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
unify_hybrid_kv_cache_specs(kv_cache_spec)
check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
if is_kv_cache_type_uniform(kv_cache_spec):
# KV cache of all layers are the same, which is true for
# most models. Allocate the same amount of memory for
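
The final hunk swaps the order of unify_hybrid_kv_cache_specs and check_enough_kv_cache_memory in get_kv_cache_config. The diff does not state the rationale, but a plausible reading is that the memory check (and the max-model-len estimate it reports on failure) should run against the specs as they will actually be allocated: unification may rewrite sliding-window specs into a single spec type with a different worst-case footprint, so checking the raw specs first could pass or fail a configuration that the unified specs would not.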