@@ -52,6 +52,11 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
 
         max_model_len = vllm_config.model_config.max_model_len
 
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in a future release
+        self.show_hidden_metrics = \
+            vllm_config.observability_config.show_hidden_metrics
+
         # System stats
         # Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -76,14 +81,15 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
         )
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_scheduler_swapped = self._gauge_cls(
-            name="vllm:num_requests_swapped",
-            documentation=(
-                "Number of requests swapped to CPU. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_scheduler_swapped = self._gauge_cls(
+                name="vllm:num_requests_swapped",
+                documentation=(
+                    "Number of requests swapped to CPU. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
@@ -93,34 +99,33 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             multiprocess_mode="sum")
 
         # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_cache_usage = self._gauge_cls(
-            name="vllm:cpu_cache_usage_perc",
-            documentation=(
-                "CPU KV-cache usage. 1 means 100 percent usage. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
-
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:cpu_prefix_cache_hit_rate",
-            documentation=(
-                "CPU prefix cache block hit rate. "
-                "DEPRECATED: KV cache offloading is not used in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_cpu_cache_usage = self._gauge_cls(
+                name="vllm:cpu_cache_usage_perc",
+                documentation=(
+                    "CPU KV-cache usage. 1 means 100 percent usage. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
+            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:cpu_prefix_cache_hit_rate",
+                documentation=(
+                    "CPU prefix cache block hit rate. "
+                    "DEPRECATED: KV cache offloading is not used in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-            name="vllm:gpu_prefix_cache_hit_rate",
-            documentation=("GPU prefix cache block hit rate. "
-                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
-                           "vllm:gpu_prefix_cache_hits in V1"),
-            labelnames=labelnames,
-            multiprocess_mode="sum")
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
+                name="vllm:gpu_prefix_cache_hit_rate",
+                documentation=("GPU prefix cache block hit rate. "
+                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
+                               "and vllm:gpu_prefix_cache_hits in V1"),
+                labelnames=labelnames,
+                multiprocess_mode="sum")
 
         # Iteration stats
         self.counter_num_preemption = self._counter_cls(
@@ -198,33 +203,35 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             labelnames=labelnames,
             buckets=request_latency_buckets)
         # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_time_in_queue_request = self._histogram_cls(
-            name="vllm:time_in_queue_requests",
-            documentation=(
-                "Histogram of time the request spent in the queue in seconds. "
-                "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-            labelnames=labelnames,
-            buckets=request_latency_buckets)
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_time_in_queue_request = self._histogram_cls(
+                name="vllm:time_in_queue_requests",
+                documentation=
+                ("Histogram of time the request spent in the queue in seconds. "
+                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
+                labelnames=labelnames,
+                buckets=request_latency_buckets)
 
         # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
-        self.histogram_model_forward_time_request = self._histogram_cls(
-            name="vllm:model_forward_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model forward pass in ms. "
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
-        self.histogram_model_execute_time_request = self._histogram_cls(
-            name="vllm:model_execute_time_milliseconds",
-            documentation=(
-                "Histogram of time spent in the model execute function in ms. "
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
-            ),
-            labelnames=labelnames,
-            buckets=build_1_2_3_5_8_buckets(3000))
+        # Hidden in 0.9, due to be removed in 0.10
+        if self.show_hidden_metrics:
+            self.histogram_model_forward_time_request = self._histogram_cls(
+                name="vllm:model_forward_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model forward pass in ms. "
+                 "DEPRECATED: use prefill/decode/inference time metrics instead."
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
+            self.histogram_model_execute_time_request = self._histogram_cls(
+                name="vllm:model_execute_time_milliseconds",
+                documentation=
+                ("Histogram of time spent in the model execute function in ms. "
+                 "DEPRECATED: use prefill/decode/inference time metrics instead."
+                 ),
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
 
         # Metadata
         self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -543,11 +550,6 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)
 
-        # Use this flag to hide metrics that were deprecated in
-        # a previous release and which will be removed in a future release
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
-
     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
@@ -580,18 +582,20 @@ def _log_prometheus(self, stats: Stats) -> None:
         # System state data
         self._log_gauge(self.metrics.gauge_scheduler_running,
                         stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped,
-                        stats.num_swapped_sys)
+        if self.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                            stats.num_swapped_sys)
         self._log_gauge(self.metrics.gauge_scheduler_waiting,
                         stats.num_waiting_sys)
         self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                         stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-                        stats.cpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-                        stats.cpu_prefix_cache_hit_rate)
-        self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
-                        stats.gpu_prefix_cache_hit_rate)
+        if self.show_hidden_metrics:
+            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+                            stats.cpu_cache_usage_sys)
+            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
+                            stats.cpu_prefix_cache_hit_rate)
+            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.gpu_prefix_cache_hit_rate)
         # Including max-lora in metric, in future this property of lora
         # config may be extended to be dynamic.
         lora_info = {
@@ -631,10 +636,13 @@ def _log_prometheus(self, stats: Stats) -> None:
                             stats.time_decode_requests)
-        self._log_histogram(self.metrics.histogram_time_in_queue_request,
-                            stats.time_in_queue_requests)
-        self._log_histogram(self.metrics.histogram_model_forward_time_request,
-                            stats.model_forward_time_requests)
-        self._log_histogram(self.metrics.histogram_model_execute_time_request,
-                            stats.model_execute_time_requests)
+        if self.show_hidden_metrics:
+            self._log_histogram(self.metrics.histogram_time_in_queue_request,
+                                stats.time_in_queue_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_forward_time_request,
+                stats.model_forward_time_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_execute_time_request,
+                stats.model_execute_time_requests)
         # Metadata
         finished_reason_counter = CollectionsCounter(
             stats.finished_reason_requests)
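Taken together, the hunks above implement one pattern: a metric deprecated in a prior release stays available behind the `show_hidden_metrics` escape hatch, with both registration and logging gated on the same flag. Below is a minimal, self-contained sketch of that pattern using `prometheus_client` directly; `DemoMetrics`, `log_stats`, and the `demo:*` metric names are illustrative stand-ins, not vLLM's actual classes or metrics.

```python
# Sketch of the "hidden deprecated metric" pattern from this PR.
# Assumes prometheus_client is installed; all names here are hypothetical.
from typing import Optional

from prometheus_client import Gauge


class DemoMetrics:

    def __init__(self, labelnames: list[str], show_hidden_metrics: bool):
        self.show_hidden_metrics = show_hidden_metrics

        # Always-on metric: registered unconditionally.
        self.gauge_running = Gauge(
            "demo:num_requests_running",
            "Number of requests currently running.",
            labelnames=labelnames)

        # Deprecated metric: only registered when the flag is set, so the
        # gauge may not exist at all on a default configuration.
        self.gauge_swapped: Optional[Gauge] = None
        if self.show_hidden_metrics:
            self.gauge_swapped = Gauge(
                "demo:num_requests_swapped",
                "Number of requests swapped to CPU. DEPRECATED.",
                labelnames=labelnames)


def log_stats(metrics: DemoMetrics, labels: dict[str, str],
              num_running: int, num_swapped: int) -> None:
    metrics.gauge_running.labels(**labels).set(num_running)
    # The logging path must be gated on the same flag as registration;
    # otherwise it would touch a gauge that was never created.
    if metrics.show_hidden_metrics:
        metrics.gauge_swapped.labels(**labels).set(num_swapped)


metrics = DemoMetrics(labelnames=["model_name"], show_hidden_metrics=True)
log_stats(metrics, {"model_name": "demo"}, num_running=3, num_swapped=0)
```

Note that the guard appears on both sides: the `__init__` hunks gate creation of each deprecated gauge and histogram, and the `_log_prometheus` hunks mirror those guards, since logging an unregistered metric would otherwise raise an `AttributeError` when the flag is off.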