
Add support for prometheus metrics #1662

Closed · wants to merge 1 commit

Dockerfile (1 addition, 1 deletion)
@@ -63,7 +63,7 @@ ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate fschat
+    pip install accelerate fschat aioprometheus

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
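
Note: aioprometheus is only added to the vllm-openai image here; everywhere else it stays an optional dependency, and the engine falls back to log-only stats when the import fails. A minimal sketch (not part of this PR) of probing for the dependency in an arbitrary environment:

# Sketch only: check whether the optional metrics dependency is present,
# mirroring the guarded import that llm_engine.py adds below.
try:
    from aioprometheus import Gauge  # available after `pip install aioprometheus`
    prometheus_available = True
except ImportError:
    prometheus_available = False

print("Prometheus metrics enabled:", prometheus_available)
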
vllm/engine/llm_engine.py (41 additions, 12 deletions)
@@ -18,6 +18,12 @@
get_tokenizer)
from vllm.utils import Counter

+try:
+    from aioprometheus import Gauge
+    _prometheus_available = True
+except ImportError:
+    _prometheus_available = False

if ray:
from ray.air.util.torch_dist import init_torch_dist_process_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -28,6 +34,19 @@
logger = init_logger(__name__)

_LOGGING_INTERVAL_SEC = 5
+if _prometheus_available:
+    gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput",
+                                        "Avg prefill throughput")
+    gauge_avg_generation_throughput = Gauge("vllm:avg_generation_throughput",
+                                            "Avg generation throughput")
+    gauge_scheduler_running = Gauge("vllm:scheduler_running",
+                                    "Num requests running")
+    gauge_scheduler_swapped = Gauge("vllm:scheduler_swapped",
+                                    "Num requests swapped")
+    gauge_scheduler_waiting = Gauge("vllm:scheduler_waiting",
+                                    "Num requests waiting")
+    gauge_gpu_cache_usage = Gauge("vllm:gpu_cache_usage", "GPU KV-cache usage")
+    gauge_cpu_cache_usage = Gauge("vllm:cpu_cache_usage", "CPU KV-cache usage")


class LLMEngine:
@@ -581,8 +600,8 @@ def _log_system_stats(
        else:
            self.num_generation_tokens.append((now, num_batched_tokens))

-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not (should_log or _prometheus_available):
            return

        # Discard the old stats.
@@ -621,16 +640,26 @@ def _log_system_stats(
        else:
            cpu_cache_usage = 0.0

-        logger.info("Avg prompt throughput: "
-                    f"{avg_prompt_throughput:.1f} tokens/s, "
-                    "Avg generation throughput: "
-                    f"{avg_generation_throughput:.1f} tokens/s, "
-                    f"Running: {len(self.scheduler.running)} reqs, "
-                    f"Swapped: {len(self.scheduler.swapped)} reqs, "
-                    f"Pending: {len(self.scheduler.waiting)} reqs, "
-                    f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
-                    f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
-        self.last_logging_time = now
+        if _prometheus_available:
+            gauge_avg_prompt_throughput.set({}, avg_prompt_throughput)
+            gauge_avg_generation_throughput.set({}, avg_generation_throughput)
+            gauge_scheduler_running.set({}, len(self.scheduler.running))
+            gauge_scheduler_swapped.set({}, len(self.scheduler.swapped))
+            gauge_scheduler_waiting.set({}, len(self.scheduler.waiting))
+            gauge_gpu_cache_usage.set({}, gpu_cache_usage)
+            gauge_cpu_cache_usage.set({}, cpu_cache_usage)
+
+        if should_log:
+            logger.info("Avg prompt throughput: "
+                        f"{avg_prompt_throughput:.1f} tokens/s, "
+                        "Avg generation throughput: "
+                        f"{avg_generation_throughput:.1f} tokens/s, "
+                        f"Running: {len(self.scheduler.running)} reqs, "
+                        f"Swapped: {len(self.scheduler.swapped)} reqs, "
+                        f"Pending: {len(self.scheduler.waiting)} reqs, "
+                        f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
+                        f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
+            self.last_logging_time = now

    def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
        """Decodes the new token for a sequence."""
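
Note on the aioprometheus API used above: set() takes a labels dict as its first argument and the sample value as its second; the engine passes an empty dict, so each gauge exposes a single unlabeled series. A standalone sketch of that call pattern (not vLLM code; the gauge and label names below are made up for illustration):

from aioprometheus import Gauge

# Same constructor shape as the vllm:* gauges above: metric name plus help text.
gauge_running = Gauge("demo:scheduler_running", "Num requests running")

# Unlabeled sample, matching the set({}, value) calls in _log_system_stats.
gauge_running.set({}, 3)

# Labeled variant for comparison; "model" is a hypothetical label, not used by this PR.
gauge_running.set({"model": "example-model"}, 3)
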
vllm/entrypoints/openai/api_server.py (11 additions, 0 deletions)
@@ -39,13 +39,24 @@
except ImportError:
    _fastchat_available = False

+try:
+    from aioprometheus import MetricsMiddleware
+    from aioprometheus.asgi.starlette import metrics
+    _prometheus_available = True
+except ImportError:
+    _prometheus_available = False

TIMEOUT_KEEP_ALIVE = 5 # seconds

logger = init_logger(__name__)
served_model = None
app = fastapi.FastAPI()
engine = None

+if _prometheus_available:
+    app.add_middleware(MetricsMiddleware)
+    app.add_route("/metrics", metrics)


def create_error_response(status_code: HTTPStatus,
message: str) -> JSONResponse:
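
The wiring above can be exercised end to end with FastAPI's test client. The sketch below is an illustration under stated assumptions, not the vLLM server itself: it reuses only the calls shown in this diff, and demo_gauge is an invented stand-in for the engine gauges defined in llm_engine.py.

import fastapi
from fastapi.testclient import TestClient
from aioprometheus import Gauge, MetricsMiddleware
from aioprometheus.asgi.starlette import metrics

app = fastapi.FastAPI()
app.add_middleware(MetricsMiddleware)  # records per-route HTTP request metrics
app.add_route("/metrics", metrics)     # Prometheus text exposition endpoint

# Hypothetical gauge standing in for the vllm:* gauges.
demo_gauge = Gauge("demo:requests_waiting", "Num requests waiting")
demo_gauge.set({}, 0)

client = TestClient(app)
response = client.get("/metrics")
print(response.status_code)   # expected: 200
print(response.text[:200])    # metrics rendered in the Prometheus text format

In a real deployment the same /metrics URL would simply be added as a scrape target in the Prometheus configuration, pointing at the API server's host and port.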