
Adds OTEL instrumentation to OpenAI API server #19987

Open · wants to merge 9 commits into main
5 changes: 5 additions & 0 deletions requirements/common.txt
@@ -44,4 +44,9 @@ watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing
opentelemetry-api>=1.26.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
opentelemetry-instrumentation-fastapi>=0.47b0
pybase64 # fast base64 implementation
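
The new dependencies cover the three pieces tracing needs: the OTel API/SDK for creating spans, the OTLP exporter for shipping them to a collector, and the FastAPI instrumentation used further down in this diff. A minimal, self-contained sketch (not part of this PR) that exercises the SDK with a console exporter, handy as a sanity check that the pinned packages resolve:

```python
# Standalone sanity check for the OpenTelemetry packages added above.
# Spans are printed to stdout instead of being sent to an OTLP collector.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("otel-dependency-check")
with tracer.start_as_current_span("smoke-test"):
    pass  # the span is exported to stdout when the block exits
```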
7 changes: 6 additions & 1 deletion vllm/engine/async_llm_engine.py
@@ -12,7 +12,8 @@

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
ObservabilityConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
@@ -1115,6 +1116,10 @@ async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
return self.engine.get_model_config()

async def get_observability_config(self) -> "ObservabilityConfig":
"""Get the observability configuration of the vLLM engine."""
return self.engine.get_observability_config()

async def get_parallel_config(self) -> ParallelConfig:
"""Get the parallel configuration of the vLLM engine."""
return self.engine.get_parallel_config()
4 changes: 4 additions & 0 deletions vllm/engine/llm_engine.py
@@ -832,6 +832,10 @@ def get_model_config(self) -> ModelConfig:
"""Gets the model configuration."""
return self.model_config

def get_observability_config(self) -> ObservabilityConfig:
"""Gets the observability configuration."""
return self.observability_config

def get_parallel_config(self) -> ParallelConfig:
"""Gets the parallel configuration."""
return self.parallel_config
7 changes: 6 additions & 1 deletion vllm/engine/multiprocessing/client.py
@@ -16,7 +16,8 @@
from zmq.asyncio import Socket

from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
# yapf conflicts with isort for this block
# yapf: disable
@@ -96,6 +97,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
# Get the configs.
self.vllm_config = engine_config
self.model_config = engine_config.model_config
self.observability_config = engine_config.observability_config
self.decoding_config = engine_config.decoding_config

# Create the tokenizer group.
@@ -387,6 +389,9 @@ async def get_decoding_config(self) -> DecodingConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_observability_config(self) -> ObservabilityConfig:
return self.observability_config

async def is_tracing_enabled(self) -> bool:
return self.tracing_flag

8 changes: 7 additions & 1 deletion vllm/engine/protocol.py
@@ -6,7 +6,8 @@
from typing import AsyncGenerator, Mapping, Optional

from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -249,6 +250,11 @@ async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
...

@abstractmethod
async def get_observability_config(self) -> "ObservabilityConfig":
"""Get the observability configuration of the vLLM engine."""
...

@abstractmethod
async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
30 changes: 27 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -35,7 +35,7 @@
from typing_extensions import assert_never

import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.config import ObservabilityConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore
from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -111,6 +111,25 @@
_running_tasks: set[asyncio.Task] = set()


def setup_otel(app: FastAPI, observability_config: ObservabilityConfig):
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

trace.set_tracer_provider(TracerProvider(resource=Resource.create()))

otlp_exporter = OTLPSpanExporter(
endpoint=observability_config.otlp_traces_endpoint)
trace.get_tracer_provider().add_span_processor(
BatchSpanProcessor(otlp_exporter))

FastAPIInstrumentor().instrument_app(app)


@asynccontextmanager
async def lifespan(app: FastAPI):
try:
@@ -1014,7 +1033,8 @@ def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
return None


def build_app(args: Namespace) -> FastAPI:
def build_app(args: Namespace,
observability_config: ObservabilityConfig) -> FastAPI:
if args.disable_fastapi_docs:
app = FastAPI(openapi_url=None,
docs_url=None,
@@ -1025,6 +1045,9 @@ def build_app(args: Namespace) -> FastAPI:
app.include_router(router)
app.root_path = args.root_path

if observability_config.otlp_traces_endpoint is not None:
setup_otel(app, observability_config)

mount_metrics(app)

app.add_middleware(
@@ -1343,7 +1366,8 @@ async def run_server_worker(listen_address,
uvicorn_kwargs['log_config'] = log_config

async with build_async_engine_client(args, client_config) as engine_client:
app = build_app(args)
observability_config = await engine_client.get_observability_config()
app = build_app(args, observability_config)

vllm_config = await engine_client.get_vllm_config()
await init_app_state(engine_client, vllm_config, app.state, args)
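
`setup_otel` only runs when `observability_config.otlp_traces_endpoint` is set (i.e. the server was started with `--otlp-traces-endpoint`), so the FastAPI middleware stays out of the request path by default. As an illustration of what `FastAPIInstrumentor().instrument_app(app)` adds, here is a standalone sketch against a toy app; the route, console exporter, and use of `TestClient` (which needs httpx installed) are placeholders for this example, not code from the PR:

```python
# Toy illustration of FastAPIInstrumentor: each request through the app
# produces server/ASGI spans. ConsoleSpanExporter stands in for the OTLP exporter.
from fastapi import FastAPI
from fastapi.testclient import TestClient
from opentelemetry import trace
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

app = FastAPI()

@app.get("/v1/models")  # stand-in route, not the real vLLM router
def list_models():
    return {"data": []}

FastAPIInstrumentor().instrument_app(app)

TestClient(app).get("/v1/models")  # spans for this request are printed to stdout
```

In the real server the exporter points at `observability_config.otlp_traces_endpoint`, so enabling HTTP-layer traces is just a matter of launching with that flag (for example `--otlp-traces-endpoint grpc://localhost:4317` against a local collector).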
6 changes: 5 additions & 1 deletion vllm/v1/engine/async_llm.py
@@ -8,7 +8,7 @@
import numpy as np

import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig
from vllm.config import ModelConfig, ObservabilityConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
@@ -93,6 +93,7 @@ def __init__(
self.vllm_config = vllm_config
self.log_requests = log_requests
self.log_stats = log_stats
self.observability_config = vllm_config.observability_config

# Set up stat loggers; independent set for each DP rank.
self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers(
@@ -526,6 +527,9 @@ async def get_vllm_config(self) -> VllmConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_observability_config(self) -> ObservabilityConfig:
return self.observability_config

async def get_decoding_config(self):
raise ValueError("Not Supported on V1 yet.")
