[V1][Metrics] Add per-request prompt/generation_tokens histograms #12516

Merged

Changes from all commits
6 changes: 6 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -202,6 +202,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:num_requests_waiting",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
]
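(Context, not part of the diff: prometheus_client exports every Histogram as three series, <name>_sum, <name>_bucket and <name>_count, which is why the test expects those suffixes for the two new vllm:request_*_tokens metrics. A minimal sketch of that check, assuming a server already exposing /metrics at base_url; the helper name is hypothetical.)

import requests

EXPECTED_HISTOGRAM_SERIES = [
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
]

def assert_request_histograms_exported(base_url: str) -> None:
    # Scrape the Prometheus exposition text and check that each expected
    # series name appears at least once.
    text = requests.get(f"{base_url}/metrics").text
    for name in EXPECTED_HISTOGRAM_SERIES:
        assert name in text, f"missing metric series: {name}"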


3 changes: 1 addition & 2 deletions vllm/v1/engine/async_llm.py
@@ -53,8 +53,7 @@ def __init__(
self.log_stats = log_stats
self.stat_loggers: List[StatLoggerBase] = [
LoggingStatLogger(),
PrometheusStatLogger(labels=dict(
model_name=self.model_config.served_model_name)),
PrometheusStatLogger(vllm_config.model_config),
]

# Tokenizer (+ ensure liveness if running in another process).
11 changes: 9 additions & 2 deletions vllm/v1/engine/output_processor.py
@@ -8,7 +8,7 @@
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.detokenizer import (DetokenizerOutput,
IncrementalDetokenizer)
from vllm.v1.metrics.stats import IterationStats
from vllm.v1.metrics.stats import IterationStats, RequestStateStats


@dataclass
@@ -37,6 +37,8 @@ def __init__(
self.is_prefilling = True
self.queue = queue

self.stats = RequestStateStats()

@classmethod
def from_new_request(
cls,
@@ -146,7 +148,8 @@ def process_outputs(
# 1) Compute stats for this iteration.
iteration_stats.update_from_output(engine_core_output,
req_state.is_prefilling,
req_state.prompt_len)
req_state.prompt_len,
req_state.stats)
req_state.is_prefilling = False

# 2) Detokenize the token ids into text.
@@ -171,6 +174,10 @@
# detected stop string, abort needed in EngineCore.
reqs_to_abort.append(req_id)

# Track per-request stats
iteration_stats.update_from_finished_request(
request_output, req_state.stats)

return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
60 changes: 54 additions & 6 deletions vllm/v1/metrics/loggers.py
@@ -1,10 +1,11 @@
import time
from abc import ABC, abstractmethod
from typing import Dict, List
from typing import List

import numpy as np
import prometheus_client

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

@@ -78,13 +79,13 @@ def log(self, scheduler_stats: SchedulerStats,

class PrometheusStatLogger(StatLoggerBase):

def __init__(self, labels: Dict[str, str]):
self.labels = labels
def __init__(self, model_config: ModelConfig):
self._unregister_vllm_metrics()

labelnames = self.labels.keys()
labelvalues = self.labels.values()
labelnames = ["model_name"]
labelvalues = [model_config.served_model_name]

self._unregister_vllm_metrics()
max_model_len = model_config.max_model_len

self.gauge_scheduler_running = prometheus_client.Gauge(
name="vllm:num_requests_running",
@@ -106,6 +107,20 @@ def __init__(self, labels: Dict[str, str]):
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)

self.histogram_num_prompt_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_prompt_tokens",
documentation="Number of prefill tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

self.histogram_num_generation_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_generation_tokens",
documentation="Number of generation tokens processed.",
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
"""Log to prometheus."""
@@ -116,9 +131,42 @@ def log(self, scheduler_stats: SchedulerStats,
self.counter_generation_tokens.inc(
iteration_stats.num_generation_tokens)

for finished_request in iteration_stats.finished_requests:
self.histogram_num_prompt_tokens_request.observe(
Collaborator:

QQ, in V0, we did:

    histogram.labels(**self.labels).observe(datum)

Do you know why this is or is not needed?

Collaborator:

I'll hit automerge to unblock you.

Contributor Author:

labels() is a pretty expensive factory method and our labels aren't ever changing, so I'm building these labelled metrics in the PrometheusStatLogger constructor:

self.histogram_num_generation_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_generation_tokens",
                documentation="Number of generation tokens processed.",
                buckets=build_1_2_5_buckets(max_model_len),
                labelnames=labelnames).labels(*labelvalues)

Not so easy to spot it though!
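(Illustration only, not part of the PR: the difference between resolving the labelled child on every observation, as in V0, and binding it once up front. The metric name below is a throwaway example.)

import prometheus_client

hist = prometheus_client.Histogram(
    name="demo_request_tokens",
    documentation="Example histogram.",
    labelnames=["model_name"])

# V0 style: labels() resolves the labelled child on every observation.
hist.labels(model_name="my-model").observe(17)

# This PR's approach: resolve the labelled child once (the label values are
# static), keep it, and call observe() directly on it afterwards.
bound = hist.labels(model_name="my-model")
bound.observe(17)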

finished_request.num_prompt_tokens)
self.histogram_num_generation_tokens_request.observe(
finished_request.num_generation_tokens)

@staticmethod
def _unregister_vllm_metrics():
# Unregister any existing vLLM collectors (for CI/CD)
for collector in list(prometheus_client.REGISTRY._collector_to_names):
if hasattr(collector, "_name") and "vllm" in collector._name:
prometheus_client.REGISTRY.unregister(collector)


def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values until the value exceeds the specified maximum.

"""
exponent = 0
buckets: List[int] = []
while True:
for m in mantissa_lst:
value = m * 10**exponent
if value <= max_value:
buckets.append(value)
else:
return buckets
exponent += 1


def build_1_2_5_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return build_buckets([1, 2, 5], max_value)
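(A quick sanity check of the bucket builder, assuming the module path introduced in this diff and a hypothetical 4096-token max_model_len; prometheus_client appends the +Inf bucket automatically.)

from vllm.v1.metrics.loggers import build_1_2_5_buckets

# The 1/2/5 mantissas are scaled by powers of 10 and capped at the largest
# value that does not exceed the model's context length.
assert build_1_2_5_buckets(4096) == [
    1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000
]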
36 changes: 32 additions & 4 deletions vllm/v1/metrics/stats.py
@@ -1,7 +1,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput


@@ -16,24 +17,51 @@ class SchedulerStats:
# gpu_prefix_cache_hit_rate: float = 0.0


@dataclass
class RequestStateStats:
"""Stats that need to be tracked across delta updates."""

num_generation_tokens: int = 0


@dataclass
class FinishedRequestStats:
"""Stats associated with a finished request."""

num_prompt_tokens: int = 0
num_generation_tokens: int = 0


class IterationStats:
"""Stats associated with a single set of EngineCoreOutputs."""

def __init__(self, log_stats: bool):
self.log_stats = log_stats
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
self.finished_requests: List[FinishedRequestStats] = []

def update_from_output(self, output: "EngineCoreOutput",
is_prefilling: bool, prompt_len: int):
is_prefilling: bool, prompt_len: int,
request_state_stats: RequestStateStats):
if not self.log_stats:
return

self.num_generation_tokens += len(output.new_token_ids)
num_new_generation_tokens = len(output.new_token_ids)

self.num_generation_tokens += num_new_generation_tokens
if is_prefilling:
# This relies on the invariant that EngineCore does
# not stream outputs for partially completed prefills
# (scheduler.update_from_output makes EngineCoreOutput
# iff num_computed_tokens == num_tokens).
assert (len(output.new_token_ids) > 0)
assert (num_new_generation_tokens > 0)
self.num_prompt_tokens += prompt_len

request_state_stats.num_generation_tokens += num_new_generation_tokens

def update_from_finished_request(self, request_output: "RequestOutput",
request_state_stats: RequestStateStats):
self.finished_requests.append(
FinishedRequestStats(len(request_output.prompt_token_ids),
request_state_stats.num_generation_tokens))
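(To make the data flow concrete, here is a sketch under assumptions rather than vLLM's actual call sites: the output processor keeps one RequestStateStats per request, feeds per-iteration deltas into IterationStats.update_from_output, and records a FinishedRequestStats via update_from_finished_request when the request completes; PrometheusStatLogger.log then observes the two histograms. The Fake* classes below are stand-ins that only model the fields the stats code reads.)

from dataclasses import dataclass, field
from typing import List

from vllm.v1.metrics.stats import IterationStats, RequestStateStats

@dataclass
class FakeEngineCoreOutput:
    # Only new_token_ids is read by update_from_output.
    new_token_ids: List[int] = field(default_factory=list)

@dataclass
class FakeRequestOutput:
    # Only prompt_token_ids is read by update_from_finished_request.
    prompt_token_ids: List[int] = field(default_factory=list)

iteration_stats = IterationStats(log_stats=True)
req_stats = RequestStateStats()

# Iteration 1: prefill completes and the first token is sampled.
iteration_stats.update_from_output(FakeEngineCoreOutput(new_token_ids=[11]),
                                   is_prefilling=True, prompt_len=4,
                                   request_state_stats=req_stats)

# Iteration 2: one more decoded token.
iteration_stats.update_from_output(FakeEngineCoreOutput(new_token_ids=[12]),
                                   is_prefilling=False, prompt_len=4,
                                   request_state_stats=req_stats)

# Request finished: capture the per-request totals for the histograms.
iteration_stats.update_from_finished_request(
    FakeRequestOutput(prompt_token_ids=[1, 2, 3, 4]), req_stats)

assert iteration_stats.num_prompt_tokens == 4
assert iteration_stats.finished_requests[0].num_generation_tokens == 2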