
Commit 06a0338

[V1][Metrics] Add API for accessing in-memory Prometheus metrics (#17010)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>

Parent: 4318c05

File tree: 10 files changed, +543 −28 lines
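All of the changed files below import from the new vllm.v1.metrics.reader module, whose own source does not appear on this page. As a rough sketch only (field names are inferred from how the examples and tests below use these objects, not copied from the vLLM source), the exposed metric types appear to behave like simple dataclasses:

    # Illustrative sketch only, NOT the actual vllm.v1.metrics.reader source.
    # Field names are inferred from the usage in the examples and tests below.
    from dataclasses import dataclass, field


    @dataclass
    class Metric:
        """Base type: a metric name plus its Prometheus labels."""
        name: str
        labels: dict[str, str] = field(default_factory=dict)


    @dataclass
    class Gauge(Metric):
        value: float = 0.0  # current value


    @dataclass
    class Counter(Metric):
        value: int = 0  # monotonically increasing total


    @dataclass
    class Vector(Metric):
        values: list[int] = field(default_factory=list)  # e.g. per-position counts


    @dataclass
    class Histogram(Metric):
        count: int = 0
        sum: float = 0.0
        buckets: dict[str, int] = field(default_factory=dict)  # upper bound -> cumulative count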

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions

@@ -222,6 +222,7 @@ steps:
 - pytest -v -s v1/test_serial_utils.py
 - pytest -v -s v1/test_utils.py
 - pytest -v -s v1/test_oracle.py
+- pytest -v -s v1/test_metrics_reader.py
 # TODO: accuracy does not match, whether setting
 # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
 - pytest -v -s v1/e2e

examples/offline_inference/eagle.py

Lines changed: 20 additions & 16 deletions

@@ -6,6 +6,7 @@
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Vector


 def load_prompts(dataset_path, num_prompts):
@@ -105,30 +106,33 @@ def main():
         print(f"generated text: {output.outputs[0].text}")
         print("-" * 50)

-    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+    try:
+        metrics = llm.get_metrics()
+    except AssertionError:
+        print("Metrics are not supported in the V0 engine.")
         return

-    # calculate the average number of accepted tokens per forward pass, +1 is
-    # to account for the token from the target model that's always going to be
-    # accepted
-    acceptance_counts = [0] * (args.num_spec_tokens + 1)
-    for output in outputs:
-        for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
-            acceptance_counts[step] += count
+    num_drafts = num_accepted = 0
+    acceptance_counts = [0] * args.num_spec_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
+            assert isinstance(metric, Counter)
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]

     print("-" * 50)
-    print(
-        f"mean acceptance length (including bonus tokens): \
-        {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}"
-    )
+    print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
     print("-" * 50)

     # print acceptance at each token position
     for i in range(len(acceptance_counts)):
-        print(
-            f"acceptance at token {i}:"
-            f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}"
-        )
+        print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")


 if __name__ == "__main__":
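For intuition, here is a small self-contained check of the acceptance-length arithmetic above, using hypothetical numbers (not produced by the commit). The "+ 1" term accounts for the token from the target model that is always accepted, as the comment removed by this diff explained.

    # Hypothetical numbers to illustrate the acceptance-length computation above.
    num_drafts = 100                    # vllm:spec_decode_num_drafts
    num_accepted = 150                  # vllm:spec_decode_num_accepted_tokens
    acceptance_counts = [80, 45, 25]    # vllm:spec_decode_num_accepted_tokens_per_pos

    assert num_accepted == sum(acceptance_counts)

    # Each draft contributes one guaranteed token from the target model,
    # hence the "+ 1" in the mean acceptance length.
    mean_acceptance_length = 1 + num_accepted / num_drafts
    print(f"mean acceptance length: {mean_acceptance_length:.2f}")  # 2.50

    # Per-position acceptance rate: how often the i-th speculative token is accepted.
    for i, count in enumerate(acceptance_counts):
        print(f"acceptance at token {i}: {count / num_drafts:.2f}")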

examples/offline_inference/metrics.py

Lines changed: 49 additions & 0 deletions (new file)

# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)

    # Dump all metrics
    for metric in llm.get_metrics():
        if isinstance(metric, Gauge):
            print(f"{metric.name} (gauge) = {metric.value}")
        elif isinstance(metric, Counter):
            print(f"{metric.name} (counter) = {metric.value}")
        elif isinstance(metric, Vector):
            print(f"{metric.name} (vector) = {metric.values}")
        elif isinstance(metric, Histogram):
            print(f"{metric.name} (histogram)")
            print(f" sum = {metric.sum}")
            print(f" count = {metric.count}")
            for bucket_le, value in metric.buckets.items():
                print(f" {bucket_le} = {value}")


if __name__ == "__main__":
    main()
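Beyond dumping everything, the list returned by llm.get_metrics() can be filtered for a single series. A minimal sketch under stated assumptions: the metric name "vllm:generation_tokens" comes from the engine test below, while the helper name and its exact summing behaviour are purely illustrative and not part of the commit.

    # Illustrative helper, not vLLM code: sum one counter across all engines.
    from vllm import LLM
    from vllm.v1.metrics.reader import Counter


    def total_generation_tokens(llm: LLM) -> int:
        """Sum the vllm:generation_tokens counter over the metrics snapshot."""
        return sum(
            int(m.value) for m in llm.get_metrics()
            if m.name == "vllm:generation_tokens" and isinstance(m, Counter))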

tests/v1/engine/test_llm_engine.py

Lines changed: 65 additions & 0 deletions

@@ -6,6 +6,7 @@
 import pytest

 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector

 MODEL = "facebook/opt-125m"
 DTYPE = "half"
@@ -97,3 +98,67 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
         raise AssertionError(
             f"{len(completion_counts)} unique completions; expected"
             f" {n}. Repeats: {repeats}")
+
+
+def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+    max_tokens = 100
+    # Use spec decoding to test num_accepted_tokens_per_pos
+    speculative_config = {
+        "method": "ngram",
+        "prompt_lookup_max": 5,
+        "prompt_lookup_min": 3,
+        "num_speculative_tokens": 5,
+    }
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+            MODEL,
+            speculative_config=speculative_config,
+            disable_log_stats=False,
+    ) as vllm_model:
+        model: LLM = vllm_model.model
+        sampling_params = SamplingParams(temperature=0.0,
+                                         max_tokens=max_tokens)
+        outputs = model.generate(example_prompts, sampling_params)
+
+        n_prompts = len(example_prompts)
+        assert len(outputs) == n_prompts
+
+        total_tokens = 0
+        for out in outputs:
+            assert len(out.outputs) == 1
+            total_tokens += len(out.outputs[0].token_ids)
+        assert total_tokens == max_tokens * n_prompts
+
+        metrics = model.get_metrics()
+
+        def find_metric(name) -> list[Metric]:
+            found = []
+            for metric in metrics:
+                if metric.name == name:
+                    found.append(metric)
+            return found
+
+        num_requests_running = find_metric("vllm:num_requests_running")
+        assert len(num_requests_running) == 1
+        assert isinstance(num_requests_running[0], Gauge)
+        assert num_requests_running[0].value == .0
+
+        generation_tokens = find_metric("vllm:generation_tokens")
+        assert len(generation_tokens) == 1
+        assert isinstance(generation_tokens[0], Counter)
+        assert generation_tokens[0].value == total_tokens
+
+        request_generation_tokens = find_metric(
+            "vllm:request_generation_tokens")
+        assert len(request_generation_tokens) == 1
+        assert isinstance(request_generation_tokens[0], Histogram)
+        assert "+Inf" in request_generation_tokens[0].buckets
+        assert request_generation_tokens[0].buckets["+Inf"] == n_prompts
+        assert request_generation_tokens[0].count == n_prompts
+        assert request_generation_tokens[0].sum == total_tokens
+
+        num_accepted_tokens_per_pos = find_metric(
+            "vllm:spec_decode_num_accepted_tokens_per_pos")
+        assert len(num_accepted_tokens_per_pos) == 1
+        assert isinstance(num_accepted_tokens_per_pos[0], Vector)
+        assert len(num_accepted_tokens_per_pos[0].values) == 5
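The histogram assertions above rely on the usual Prometheus convention that each bucket holds a cumulative count of observations at or below its upper bound (the reader tests below observe 42, 21 and 7 and expect buckets 10.0→1, 20.0→1, 30.0→2, 40.0→2, 50.0→3). As a small illustrative helper, not part of the commit, per-interval counts can be recovered from that cumulative form like this:

    # Illustrative only: convert cumulative Prometheus-style buckets
    # (upper bound -> cumulative count) into per-interval counts.
    def bucket_increments(buckets: dict[str, int]) -> dict[str, int]:
        increments = {}
        previous = 0
        # Sort by numeric upper bound, keeping "+Inf" last.
        for le in sorted(buckets,
                         key=lambda k: float("inf") if k == "+Inf" else float(k)):
            increments[le] = buckets[le] - previous
            previous = buckets[le]
        return increments


    # Data matching the reader tests below: observations 42, 21 and 7.
    cumulative = {"10.0": 1, "20.0": 1, "30.0": 2, "40.0": 2, "50.0": 3, "+Inf": 3}
    assert bucket_increments(cumulative) == {
        "10.0": 1, "20.0": 0, "30.0": 1, "40.0": 0, "50.0": 1, "+Inf": 0}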

tests/v1/test_metrics_reader.py

Lines changed: 112 additions & 0 deletions (new file)

# SPDX-License-Identifier: Apache-2.0

import prometheus_client
import pytest

from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector,
                                    get_metrics_snapshot)


@pytest.fixture(autouse=True)
def test_registry(monkeypatch):
    # Use a custom registry for tests
    test_registry = prometheus_client.CollectorRegistry(auto_describe=True)
    monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry)
    return test_registry


@pytest.mark.parametrize("num_engines", [1, 4])
def test_gauge_metric(test_registry, num_engines):
    g = prometheus_client.Gauge("vllm:test_gauge",
                                "Test gauge metric",
                                labelnames=["model", "engine_index"],
                                registry=test_registry)
    for i in range(num_engines):
        g.labels(model="foo", engine_index=str(i)).set(98.5)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Gauge)
        assert m.name == "vllm:test_gauge"
        assert m.value == 98.5
        assert m.labels["model"] == "foo"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_counter_metric(test_registry, num_engines):
    c = prometheus_client.Counter("vllm:test_counter",
                                  "Test counter metric",
                                  labelnames=["model", "engine_index"],
                                  registry=test_registry)
    for i in range(num_engines):
        c.labels(model="bar", engine_index=str(i)).inc(19)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Counter)
        assert m.name == "vllm:test_counter"
        assert m.value == 19
        assert m.labels["model"] == "bar"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_histogram_metric(test_registry, num_engines):
    h = prometheus_client.Histogram("vllm:test_histogram",
                                    "Test histogram metric",
                                    labelnames=["model", "engine_index"],
                                    buckets=[10, 20, 30, 40, 50],
                                    registry=test_registry)
    for i in range(num_engines):
        hist = h.labels(model="blaa", engine_index=str(i))
        hist.observe(42)
        hist.observe(21)
        hist.observe(7)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Histogram)
        assert m.name == "vllm:test_histogram"
        assert m.count == 3
        assert m.sum == 70
        assert m.buckets["10.0"] == 1
        assert m.buckets["20.0"] == 1
        assert m.buckets["30.0"] == 2
        assert m.buckets["40.0"] == 2
        assert m.buckets["50.0"] == 3
        assert m.labels["model"] == "blaa"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])


@pytest.mark.parametrize("num_engines", [1, 4])
def test_vector_metric(test_registry, num_engines):
    c = prometheus_client.Counter(
        "vllm:spec_decode_num_accepted_tokens_per_pos",
        "Vector-like counter metric",
        labelnames=["position", "model", "engine_index"],
        registry=test_registry)
    for i in range(num_engines):
        c.labels(position="0", model="llama", engine_index=str(i)).inc(10)
        c.labels(position="1", model="llama", engine_index=str(i)).inc(5)
        c.labels(position="2", model="llama", engine_index=str(i)).inc(1)

    metrics = get_metrics_snapshot()
    assert len(metrics) == num_engines
    engine_labels = [str(i) for i in range(num_engines)]
    for m in metrics:
        assert isinstance(m, Vector)
        assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos"
        assert m.values == [10, 5, 1]
        assert m.labels["model"] == "llama"
        assert m.labels["engine_index"] in engine_labels
        engine_labels.remove(m.labels["engine_index"])
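The reader module itself is not shown on this page, but test_vector_metric above does pin down one piece of its behaviour: counter samples that differ only in their "position" label are folded into a single Vector whose values line up with that label. A tiny illustrative helper (an assumption-laden sketch, not vLLM code) showing that folding:

    # Illustrative only: collapse per-position (labels, value) samples into the
    # ordered list a Vector exposes, as exercised by test_vector_metric above.
    def samples_to_vector_values(
            samples: list[tuple[dict[str, str], float]]) -> list[int]:
        """samples: (labels, value) pairs sharing the same non-position labels."""
        by_position = {int(labels["position"]): value for labels, value in samples}
        return [int(by_position[pos]) for pos in sorted(by_position)]


    samples = [({"position": "0"}, 10.0), ({"position": "2"}, 1.0),
               ({"position": "1"}, 5.0)]
    assert samples_to_vector_values(samples) == [10, 5, 1]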

vllm/entrypoints/llm.py

Lines changed: 19 additions & 1 deletion

@@ -4,7 +4,8 @@
 import warnings
 from collections.abc import Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, ClassVar, Optional, Union, cast, overload
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
+                    cast, overload)

 import cloudpickle
 import torch.nn as nn
@@ -47,6 +48,9 @@
 from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                         is_list_of)

+if TYPE_CHECKING:
+    from vllm.v1.metrics.reader import Metric
+
 logger = init_logger(__name__)

 _R = TypeVar("_R", default=Any)
@@ -1294,6 +1298,20 @@ def wake_up(self, tags: Optional[list[str]] = None):
         """
         self.llm_engine.wake_up(tags)

+    def get_metrics(self) -> list["Metric"]:
+        """Return a snapshot of aggregated metrics from Prometheus.
+
+        Returns:
+            A ``MetricSnapshot`` instance capturing the current state
+            of all aggregated metrics from Prometheus.
+
+        Note:
+            This method is only available with the V1 LLM engine.
+        """
+        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+        assert isinstance(self.llm_engine, V1LLMEngine)
+        return self.llm_engine.get_metrics()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
