[V1][Metrics] Deprecate metrics with gpu_ prefix for non-GPU-specific metrics. #18354


Merged · 8 commits · Jun 14, 2025
2 changes: 1 addition & 1 deletion vllm/v1/core/sched/scheduler.py
```diff
@@ -940,7 +940,7 @@ def make_stats(
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
-            gpu_cache_usage=self.kv_cache_manager.usage,
+            kv_cache_usage=self.kv_cache_manager.usage,
             prefix_cache_stats=prefix_cache_stats,
             spec_decoding_stats=spec_decoding_stats,
         )
```
45 changes: 39 additions & 6 deletions vllm/v1/metrics/loggers.py
```diff
@@ -126,7 +126,7 @@ def log(self):
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
-            scheduler_stats.gpu_cache_usage * 100,
+            scheduler_stats.kv_cache_usage * 100,
             self.prefix_caching_metrics.hit_rate * 100,
         )
         self.spec_decoding_logging.log(log_fn=log_fn)
```

```diff
@@ -184,22 +184,49 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         #
         # GPU cache
         #
+        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
-            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            documentation=(
+                "GPU KV-cache usage. 1 means 100 percent usage. "
+                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
             multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)

+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.counter_gpu_prefix_cache_queries = self._counter_cls(
             name="vllm:gpu_prefix_cache_queries",
-            documentation=
-            "GPU prefix cache queries, in terms of number of queried tokens.",
+            documentation=(
+                "GPU prefix cache queries, in terms of number of queried "
+                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
             labelnames=labelnames).labels(*labelvalues)

+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.counter_gpu_prefix_cache_hits = self._counter_cls(
             name="vllm:gpu_prefix_cache_hits",
-            documentation=
-            "GPU prefix cache hits, in terms of number of cached tokens.",
+            documentation=(
+                "GPU prefix cache hits, in terms of number of cached tokens. "
+                "DEPRECATED: Use vllm:prefix_cache_hits instead."),
             labelnames=labelnames).labels(*labelvalues)

+        self.gauge_kv_cache_usage = self._gauge_cls(
+            name="vllm:kv_cache_usage_perc",
+            documentation="KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_prefix_cache_queries = self._counter_cls(
+            name="vllm:prefix_cache_queries",
+            documentation=(
+                "Prefix cache queries, in terms of number of queried tokens."),
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_prefix_cache_hits = self._counter_cls(
+            name="vllm:prefix_cache_hits",
+            documentation=(
+                "Prefix cache hits, in terms of number of cached tokens."),
+            labelnames=labelnames).labels(*labelvalues)
+
         #
```

Review comment (achandrasekar), on the new vllm:prefix_cache_queries metric:

Renaming existing metrics will break anyone relying on them. The right way to do this would be to add new metrics with the new names and deprecate the old ones, so users get enough notice before the old metrics are removed in a future release. I'm not sure of the exact deprecation policy with vLLM, but it would be good to follow the policy here.

Reply (Contributor Author):

Thanks @achandrasekar, makes sense! Referring to the metrics deprecation policy:

> Note: when metrics are deprecated in version X.Y, they are hidden in version X.Y+1 but can be re-enabled using the --show-hidden-metrics-for-version=X.Y escape hatch, and are then removed in version X.Y+2.

I have declared gpu_prefix_cache_queries and gpu_prefix_cache_hits as deprecated, and introduced the new ones. Could you please take a look?

It looks like we will need separate pull requests for hiding and then removing the old metrics once this one gets merged.
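For context on the policy's next step: in 0.10 the gpu_-prefixed metrics would be hidden unless the user passes the escape hatch. Below is a minimal standalone sketch of that gating, assuming a `show_hidden_metrics` boolean derived from `--show-hidden-metrics-for-version`; the flag plumbing and the `make_kv_cache_gauges` helper are illustrative, not part of this diff:

```python
from prometheus_client import Gauge


def make_kv_cache_gauges(show_hidden_metrics: bool,
                         labelnames: list[str],
                         labelvalues: list[str]):
    """Sketch of the 0.10 'hide' step: always register the new gauge,
    and re-register the deprecated gpu_-prefixed one only when the user
    opts back in via --show-hidden-metrics-for-version=0.9."""
    gauges = {
        "kv_cache_usage": Gauge(
            "vllm:kv_cache_usage_perc",
            "KV-cache usage. 1 means 100 percent usage.",
            labelnames).labels(*labelvalues),
    }
    if show_hidden_metrics:
        # Deprecated alias, kept alive only behind the escape hatch.
        gauges["gpu_cache_usage"] = Gauge(
            "vllm:gpu_cache_usage_perc",
            "GPU KV-cache usage. 1 means 100 percent usage. "
            "DEPRECATED: Use vllm:kv_cache_usage_perc instead.",
            labelnames).labels(*labelvalues)
    return gauges


# record() would then update whichever gauges were registered:
gauges = make_kv_cache_gauges(False, ["engine"], ["0"])
for gauge in gauges.values():
    gauge.set(0.42)  # kv_cache_usage from SchedulerStats
```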
```diff
@@ -399,13 +426,19 @@ def record(self, scheduler_stats: Optional[SchedulerStats],
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)

-        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+        self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage)
+        self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage)

         self.counter_gpu_prefix_cache_queries.inc(
             scheduler_stats.prefix_cache_stats.queries)
         self.counter_gpu_prefix_cache_hits.inc(
             scheduler_stats.prefix_cache_stats.hits)

+        self.counter_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         if scheduler_stats.spec_decoding_stats is not None:
             self.spec_decoding_prom.observe(
                 scheduler_stats.spec_decoding_stats)
```
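Because the old and new names are emitted side by side during the transition, the migration can be sanity-checked by scraping the server and comparing the two series. A small sketch, assuming a vLLM server exposing Prometheus metrics at localhost:8000/metrics and the prometheus_client package installed:

```python
import urllib.request

from prometheus_client.parser import text_string_to_metric_families

# The endpoint is deployment-specific; adjust host, port, and path.
text = urllib.request.urlopen("http://localhost:8000/metrics").read().decode()

pairs = {
    "vllm:gpu_cache_usage_perc": "deprecated",
    "vllm:kv_cache_usage_perc": "new",
}
for family in text_string_to_metric_families(text):
    for sample in family.samples:
        if sample.name in pairs:
            # Both series should report the same value while both exist.
            print(f"{pairs[sample.name]:>10}: {sample.name} "
                  f"{sample.labels} = {sample.value}")
```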
2 changes: 1 addition & 1 deletion vllm/v1/metrics/stats.py
```diff
@@ -32,7 +32,7 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0

-    gpu_cache_usage: float = 0.0
+    kv_cache_usage: float = 0.0

     prefix_cache_stats: PrefixCacheStats = field(
         default_factory=PrefixCacheStats)
```
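To illustrate the dataclass change in isolation, here is a trimmed-down sketch of SchedulerStats with the renamed field (the real class in vllm/v1/metrics/stats.py has additional fields):

```python
from dataclasses import dataclass, field


@dataclass
class PrefixCacheStats:
    queries: int = 0
    hits: int = 0


@dataclass
class SchedulerStats:
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    kv_cache_usage: float = 0.0  # renamed from gpu_cache_usage in this PR
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)


# Any external consumer that read stats.gpu_cache_usage must move to the
# new field name; the old attribute no longer exists after this change.
stats = SchedulerStats(num_running_reqs=2, kv_cache_usage=0.35)
print(f"KV-cache usage: {stats.kv_cache_usage:.0%}")  # -> 35%
```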