
Commit 7f4ecd7

[Metrics] Hide deprecated metrics
As per https://docs.vllm.ai/en/stable/serving/metrics.html:

> when metrics are deprecated in version X.Y, they are hidden in
> version X.Y+1 but can be re-enabled using the
> --show-hidden-metrics-for-version=X.Y escape hatch, and are then
> removed in version X.Y+2.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
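For illustration only: the escape hatch described above reduces to a single boolean that is true only when the user names the exact release in which a metric was deprecated. This is a minimal sketch of those semantics, not vLLM's actual implementation; the helper names are invented.

# Illustrative sketch of the escape-hatch semantics quoted above.
# Not vLLM's actual implementation; names are invented.

def version_tuple(version: str) -> tuple:
    """Parse "X.Y" into a comparable (X, Y) tuple."""
    return tuple(int(part) for part in version.split("."))

def show_hidden_metrics(requested: str, deprecated_in: str) -> bool:
    """True if metrics deprecated in `deprecated_in` should still be
    exposed, given the --show-hidden-metrics-for-version value the
    user passed (empty string if the flag was not passed)."""
    if not requested:
        return False
    # The escape hatch re-enables metrics for exactly one release:
    # deprecated in X.Y -> hidden in X.Y+1 -> removed in X.Y+2.
    return version_tuple(requested) == version_tuple(deprecated_in)

assert show_hidden_metrics("0.8", "0.8") is True   # re-enabled
assert show_hidden_metrics("", "0.8") is False     # hidden by default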
1 parent cb27dec commit 7f4ecd7

File tree

1 file changed: +84 −76 lines changed

vllm/engine/metrics.py

@@ -52,6 +52,11 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
 
         max_model_len = vllm_config.model_config.max_model_len
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in a future version
+        self.show_hidden_metrics = \
+            vllm_config.observability_config.show_hidden_metrics
+
         # System stats
         #   Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -76,14 +81,15 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
         )
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_scheduler_swapped = self._gauge_cls(
-            name="vllm:num_requests_swapped",
-            documentation=(
-                "Number of requests swapped to CPU. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_scheduler_swapped = self._gauge_cls(
+                name="vllm:num_requests_swapped",
+                documentation=(
+                    "Number of requests swapped to CPU. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
@@ -93,34 +99,33 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             multiprocess_mode="sum")
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_cache_usage = self._gauge_cls(
-            name="vllm:cpu_cache_usage_perc",
-            documentation=(
-                "CPU KV-cache usage. 1 means 100 percent usage. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:cpu_prefix_cache_hit_rate",
-            documentation=(
-                "CPU prefix cache block hit rate. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_cpu_cache_usage = self._gauge_cls(
+                name="vllm:cpu_cache_usage_perc",
+                documentation=(
+                    "CPU KV-cache usage. 1 means 100 percent usage. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
+            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:cpu_prefix_cache_hit_rate",
+                documentation=(
+                    "CPU prefix cache block hit rate. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:gpu_prefix_cache_hit_rate",
-            documentation=("GPU prefix cache block hit rate. "
-                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
-                           "vllm:gpu_prefix_cache_hits in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:gpu_prefix_cache_hit_rate",
+                documentation=("GPU prefix cache block hit rate. "
+                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
+                               "and vllm:gpu_prefix_cache_hits in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
@@ -198,33 +203,35 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             labelnames=labelnames,
             buckets=request_latency_buckets)
         # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_time_in_queue_request = self._histogram_cls(
-            name="vllm:time_in_queue_requests",
-            documentation=(
-                "Histogram of time the request spent in the queue in seconds. "
-                "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_time_in_queue_request = self._histogram_cls(
+                name="vllm:time_in_queue_requests",
+                documentation=
+                ("Histogram of time the request spent in the queue in seconds. "
+                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
+                labelnames=labelnames,
+                buckets=request_latency_buckets)
 
         # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_model_forward_time_request = self._histogram_cls(
-            name="vllm:model_forward_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model forward pass in ms. "
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
-        self.histogram_model_execute_time_request = self._histogram_cls(
-            name="vllm:model_execute_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model execute function in ms. "
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_model_forward_time_request = self._histogram_cls(
+                name="vllm:model_forward_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model forward pass in ms. "
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
+            self.histogram_model_execute_time_request = self._histogram_cls(
+                name="vllm:model_execute_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model execute function in ms. "
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
 
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -543,11 +550,6 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
-        # Use this flag to hide metrics that were deprecated in
-        # a previous release and which will be removed in a future version
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
-
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
@@ -580,18 +582,21 @@ def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                        stats.num_swapped_sys)
+        if self.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                        stats.cpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                        stats.cpu_prefix_cache_hit_rate)
-        self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                        stats.gpu_prefix_cache_hit_rate)
+        if self.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                            stats.cpu_cache_usage_sys)
+            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
+                            stats.cpu_prefix_cache_hit_rate)
+        if self.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
@@ -631,10 +636,13 @@ def _log_prometheus(self, stats: Stats) -> None:
                             stats.time_decode_requests)
         self._log_histogram(self.metrics.histogram_time_in_queue_request,
                             stats.time_in_queue_requests)
-        self._log_histogram(self.metrics.histogram_model_forward_time_request,
-                            stats.model_forward_time_requests)
-        self._log_histogram(self.metrics.histogram_model_execute_time_request,
-                            stats.model_execute_time_requests)
+        if self.show_hidden_metrics:
+            self._log_histogram(
+                self.metrics.histogram_model_forward_time_request,
+                stats.model_forward_time_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_execute_time_request,
+                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
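The diff applies the same pattern at every site: both the registration of a deprecated metric and every log call that touches it are guarded by the same show_hidden_metrics flag, so a hidden metric is neither created nor written. Below is a standalone sketch of that pattern using prometheus_client; the class and metric names are invented for illustration and are not vLLM's.

# Standalone sketch of the gating pattern from this diff.
# Class and metric names are invented; only the pattern matches.
from prometheus_client import Gauge

class DemoMetrics:
    def __init__(self, show_hidden_metrics: bool) -> None:
        self.show_hidden_metrics = show_hidden_metrics
        # Current metric: always registered.
        self.gauge_running = Gauge("demo_num_requests_running",
                                   "Number of requests running.")
        # Deprecated metric: only registered behind the escape hatch,
        # mirroring the `if self.show_hidden_metrics:` blocks above.
        if self.show_hidden_metrics:
            self.gauge_swapped = Gauge(
                "demo_num_requests_swapped",
                "Number of requests swapped. DEPRECATED.")

    def log(self, num_running: int, num_swapped: int) -> None:
        self.gauge_running.set(num_running)
        # The log site must be guarded by the same flag: when the
        # metric is hidden, the attribute was never created.
        if self.show_hidden_metrics:
            self.gauge_swapped.set(num_swapped)

Guarding both sites with one flag keeps the failure mode safe: forgetting the guard at a log site raises an AttributeError in testing rather than silently exporting a deprecated series.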
