Skip to content

Commit 0df68e0

Browse files
committed
metric
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
1 parent aa1e77a commit 0df68e0

File tree

3 files changed

+39
-3
lines changed

3 files changed

+39
-3
lines changed

vllm/v1/core/kv_cache_manager.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from vllm.logger import init_logger
55
from vllm.utils import cdiv
66
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
7-
KVCacheBlock,
7+
KVCacheBlock, PrefixCachingMetrics,
88
generate_block_hash_extra_keys,
99
hash_block_tokens,
1010
hash_request_tokens)
@@ -69,6 +69,12 @@ def __init__(
6969
# is finished.
7070
self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
7171

72+
# Prefix cache metrics.
73+
self.prefix_caching_metrics: PrefixCachingMetrics = {
74+
"query_total": 0,
75+
"query_hit": 0,
76+
}
77+
7278
def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
7379
"""Get the computed (cached) blocks for the request.
7480
Note that the computed blocks must be full.
@@ -101,6 +107,8 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
101107
else:
102108
break
103109

110+
self.prefix_caching_metrics["query_total"] += len(block_hashes)
111+
self.prefix_caching_metrics["query_hit"] += len(computed_blocks)
104112
return computed_blocks
105113

106114
def append_slots(
@@ -328,6 +336,17 @@ def get_num_common_prefix_blocks(
328336
break
329337
return num_common_blocks
330338

339+
def get_prefix_caching_hit_rate(self) -> float:
    """Get the hit rate of prefix caching.

    The rate is the accumulated ``query_hit`` counter divided by the
    accumulated ``query_total`` counter, both read from
    ``self.prefix_caching_metrics``.

    Returns:
        The hit rate of prefix caching, or 0.0 when ``query_total`` is
        still zero.
    """
    metrics = self.prefix_caching_metrics
    total = metrics["query_total"]
    # Nothing has been counted yet; return early to avoid dividing by zero.
    if total == 0:
        return 0.0
    return metrics["query_hit"] / total
349+
331350
def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
332351
"""Get new blocks from the free block pool.
333352

vllm/v1/core/kv_cache_utils.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""KV-Cache Utilities."""
22
from collections.abc import Sequence
33
from dataclasses import dataclass
4-
from typing import Any, List, NamedTuple, Optional, Tuple
4+
from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict
55

66
from vllm.logger import init_logger
77
from vllm.v1.request import Request
@@ -24,6 +24,16 @@ class BlockHashType(NamedTuple):
2424
extra_keys: Optional[Any] = None
2525

2626

27+
class PrefixCachingMetrics(TypedDict):
    """Counters used to compute the prefix caching hit rate.

    Both counters are block-granular: the KV cache manager adds the number
    of block hashes of a request to ``query_total`` and the number of
    computed (cached) blocks it found to ``query_hit``.
    """

    query_total: int
    """Accumulated number of block hashes over all prefix cache queries."""

    query_hit: int
    """Accumulated number of computed (cached) blocks found for them."""
35+
36+
2737
@dataclass
2838
class KVCacheBlock:
2939
"""KV-cache block metadata."""

vllm/v1/engine/core.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,17 @@ def _log_stats(self):
244244
now = time.time()
245245

246246
if now - self._last_logging_time > LOGGING_TIME_S:
247+
prefix_caching_hit_rate = ""
248+
if (hit_rate := self.scheduler.kv_cache_manager.
249+
get_prefix_caching_hit_rate()) > 0:
250+
prefix_caching_hit_rate = (
251+
f" | PrefixCachingHitRate: {hit_rate:.2f}")
252+
247253
logger.info(
248-
"RUNNING: %s | WAITING: %s",
254+
"RUNNING: %s | WAITING: %s%s",
249255
len(self.scheduler.running),
250256
len(self.scheduler.waiting),
257+
prefix_caching_hit_rate,
251258
)
252259

253260
self._last_logging_time = now

0 commit comments

Comments
 (0)