
Adds OTEL instrumentation to OpenAI API server #19987

Open · wants to merge 9 commits into main
5 changes: 5 additions & 0 deletions requirements/common.txt
@@ -44,4 +44,9 @@ watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing
opentelemetry-api>=1.26.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
opentelemetry-instrumentation-fastapi>=0.47b0
pybase64 # fast base64 implementation
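
The new dependencies cover the three pieces tracing needs: the OTel API/SDK for creating spans, the OTLP exporter for shipping them to a collector, and the FastAPI instrumentation used further down in this diff. A minimal, self-contained sketch (not part of this PR) that exercises the SDK with a console exporter, handy as a sanity check that the pinned packages resolve:

```python
# Standalone sanity check for the OpenTelemetry packages added above.
# Spans are printed to stdout instead of being sent to an OTLP collector.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("otel-dependency-check")
with tracer.start_as_current_span("smoke-test"):
    pass  # the span is exported to stdout when the block exits
```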
7 changes: 6 additions & 1 deletion vllm/engine/async_llm_engine.py
@@ -12,7 +12,8 @@

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
ObservabilityConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
@@ -1115,6 +1116,10 @@ async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
return self.engine.get_model_config()

async def get_observability_config(self) -> "ObservabilityConfig":
"""Get the observability configuration of the vLLM engine."""
return self.engine.get_observability_config()

async def get_parallel_config(self) -> ParallelConfig:
"""Get the parallel configuration of the vLLM engine."""
return self.engine.get_parallel_config()
4 changes: 4 additions & 0 deletions vllm/engine/llm_engine.py
@@ -832,6 +832,10 @@ def get_model_config(self) -> ModelConfig:
"""Gets the model configuration."""
return self.model_config

def get_observability_config(self) -> ObservabilityConfig:
"""Gets the observability configuration."""
return self.observability_config

def get_parallel_config(self) -> ParallelConfig:
"""Gets the parallel configuration."""
return self.parallel_config
7 changes: 6 additions & 1 deletion vllm/engine/multiprocessing/client.py
@@ -16,7 +16,8 @@
from zmq.asyncio import Socket

from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
# yapf conflicts with isort for this block
# yapf: disable
@@ -96,6 +97,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
# Get the configs.
self.vllm_config = engine_config
self.model_config = engine_config.model_config
self.observability_config = engine_config.observability_config
self.decoding_config = engine_config.decoding_config

# Create the tokenizer group.
@@ -387,6 +389,9 @@ async def get_decoding_config(self) -> DecodingConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_observability_config(self) -> ObservabilityConfig:
return self.observability_config

async def is_tracing_enabled(self) -> bool:
return self.tracing_flag

8 changes: 7 additions & 1 deletion vllm/engine/protocol.py
@@ -6,7 +6,8 @@
from typing import AsyncGenerator, Mapping, Optional

from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -249,6 +250,11 @@ async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
...

@abstractmethod
async def get_observability_config(self) -> "ObservabilityConfig":
"""Get the observability configuration of the vLLM engine."""
...

@abstractmethod
async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
30 changes: 27 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -35,7 +35,7 @@
from typing_extensions import assert_never

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.config import ObservabilityConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -111,6 +111,25 @@
_running_tasks: set[asyncio.Task] = set()


def setup_otel(app: FastAPI, observability_config: ObservabilityConfig):
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

trace.set_tracer_provider(TracerProvider(resource=Resource.create()))

otlp_exporter = OTLPSpanExporter(
endpoint=observability_config.otlp_traces_endpoint)
trace.get_tracer_provider().add_span_processor(
BatchSpanProcessor(otlp_exporter))

FastAPIInstrumentor().instrument_app(app)


@asynccontextmanager
async def lifespan(app: FastAPI):
try:
@@ -1014,7 +1033,8 @@ def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
return None


def build_app(args: Namespace) -> FastAPI:
def build_app(args: Namespace,
observability_config: ObservabilityConfig) -> FastAPI:
if args.disable_fastapi_docs:
app = FastAPI(openapi_url=None,
docs_url=None,
@@ -1025,6 +1045,9 @@ def build_app(args: Namespace) -> FastAPI:
app.include_router(router)
app.root_path = args.root_path

if observability_config.otlp_traces_endpoint is not None:
setup_otel(app, observability_config)

mount_metrics(app)

app.add_middleware(
@@ -1343,7 +1366,8 @@ async def run_server_worker(listen_address,
uvicorn_kwargs['log_config'] = log_config

async with build_async_engine_client(args, client_config) as engine_client:
app = build_app(args)
observability_config = await engine_client.get_observability_config()
app = build_app(args, observability_config)

vllm_config = await engine_client.get_vllm_config()
await init_app_state(engine_client, vllm_config, app.state, args)
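
`setup_otel` only runs when `observability_config.otlp_traces_endpoint` is set (i.e. the server was started with `--otlp-traces-endpoint`), so the FastAPI middleware stays out of the request path by default. As an illustration of what `FastAPIInstrumentor().instrument_app(app)` adds, here is a standalone sketch against a toy app; the route, console exporter, and use of `TestClient` (which needs httpx installed) are placeholders for this example, not code from the PR:

```python
# Toy illustration of FastAPIInstrumentor: each request through the app
# produces server/ASGI spans. ConsoleSpanExporter stands in for the OTLP exporter.
from fastapi import FastAPI
from fastapi.testclient import TestClient
from opentelemetry import trace
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

app = FastAPI()

@app.get("/v1/models")  # stand-in route, not the real vLLM router
def list_models():
    return {"data": []}

FastAPIInstrumentor().instrument_app(app)

TestClient(app).get("/v1/models")  # spans for this request are printed to stdout
```

In the real server the exporter points at `observability_config.otlp_traces_endpoint`, so enabling HTTP-layer traces is just a matter of launching with that flag (for example `--otlp-traces-endpoint grpc://localhost:4317` against a local collector).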
6 changes: 5 additions & 1 deletion vllm/v1/engine/async_llm.py
@@ -8,7 +8,7 @@
import numpy as np

import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig
from vllm.config import ModelConfig, ObservabilityConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
@@ -93,6 +93,7 @@ def __init__(
self.vllm_config = vllm_config
self.log_requests = log_requests
self.log_stats = log_stats
self.observability_config = vllm_config.observability_config

# Set up stat loggers; independent set for each DP rank.
self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers(
@@ -526,6 +527,9 @@ async def get_vllm_config(self) -> VllmConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_observability_config(self) -> ObservabilityConfig:
return self.observability_config

async def get_decoding_config(self):
raise ValueError("Not Supported on V1 yet.")
