@@ -305,12 +305,49 @@ async def start(self):
         logger.info("Started vLLM engine.")
 
     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic that determines the v0/v1 engine is:
+        # 1. If VLLM_USE_V1 is not set, vLLM tries to use the v1 engine. However,
+        #    if any feature specified in the engine config is not supported, it
+        #    falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM falls back to v0 in that case.
+        # 2. If VLLM_USE_V1 is set to 1, vLLM uses the v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, vLLM is forced to use the v0 engine.
+        # In Ray Serve LLM we forbid case 1, because we have to know exactly
+        # which engine is used.
+        if not envs.is_set("VLLM_USE_V1"):
+            raise AssertionError(
+                "Starting from Ray 2.45, the VLLM_USE_V1 environment variable "
+                "must be set to prevent undetermined behavior."
+            )
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine. Note that we only use _get_async_engine_args
+        to get the engine args and don't use _get_vllm_engine_config, because
+        we integrate vLLM v1 using the highest-level async engine API.
+        TODO: Refactor the vLLM v0 integration to use the same async engine API
+        to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient
 
         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
         engine_args, engine_config = _get_vllm_engine_config(self.llm_config)
 
-        if MQLLMEngineClient.is_unsupported_config(engine_args):
+        if MQLLMEngineClient.is_unsupported_config(engine_config):
             # If the engine is not supported, we fall back to the legacy async engine.
             #
             # Note (genesu): as of 2025-02-11, this code path is only triggered when
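
A note on the contract this hunk enforces: callers must pin VLLM_USE_V1 before the engine starts, or _start_engine raises. A minimal sketch of the caller side (the env-var values come from the patch; setting them via os.environ at process startup is an assumption about deployment style, not part of this commit):

    import os

    # Pin the engine version up front. "1" routes to _start_engine_v1
    # (AsyncLLMEngine); "0" forces the v0 MQLLMEngineClient path.
    os.environ["VLLM_USE_V1"] = "1"

    from vllm import envs

    assert envs.is_set("VLLM_USE_V1")  # the same check _start_engine performs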
@@ -502,20 +539,36 @@ async def _generate(
         )
 
         if request_output is not None:
-            time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
             total_request_time = time.perf_counter() - start
-            generation_time = (
-                total_request_time - request_output.metrics.time_in_queue
-            )
+            if request_output.metrics is None:
+                # vLLM V1 metrics are not included in the request output yet.
+                queue_time = "N/A"
+                generation_time_str = "N/A"
+                tokens_s = "N/A"
+                generated_tokens_s = "N/A"
+            else:
+                time_in_queue_histogram.observe(
+                    request_output.metrics.time_in_queue
+                )
+                queue_time = f"{request_output.metrics.time_in_queue}s"
+                generation_time = (
+                    total_request_time - request_output.metrics.time_in_queue
+                )
+                generation_time_str = f"{generation_time}s"
+                tokens_s = (
+                    num_input_tokens + all_tokens_collected
+                ) / generation_time
+                generated_tokens_s = all_tokens_collected / generation_time
+
             logger.info(
                 f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                 f"Total time: {total_request_time}s, "
-                f"Queue time: {request_output.metrics.time_in_queue}s, "
-                f"Generation+async time: {generation_time}s, "
+                f"Queue time: {queue_time}, "
+                f"Generation+async time: {generation_time_str}, "
                 f"Input tokens: {num_input_tokens}, "
                 f"Generated tokens: {all_tokens_collected}, "
-                f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                f"generated tokens/s: {all_tokens_collected / generation_time}."
+                f"tokens/s: {tokens_s}, "
+                f"generated tokens/s: {generated_tokens_s}."
             )
         else:
             logger.warning(
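
The metrics guard in this hunk is the pattern to reuse anywhere a request output is consumed: treat .metrics as optional and degrade to "N/A" placeholders instead of raising. A standalone sketch of the same logic (function and parameter names are illustrative, not from the patch):

    from typing import Optional

    def timing_summary(
        total_request_time: float,
        num_input_tokens: int,
        generated_tokens: int,
        time_in_queue: Optional[float],
    ) -> str:
        """Format a per-request timing line, tolerating absent queue metrics
        (as with vLLM v1, whose request outputs carry no metrics yet)."""
        if time_in_queue is None:
            queue_time = generation_time = tokens_s = "N/A"
        else:
            queue_time = f"{time_in_queue}s"
            gen_s = total_request_time - time_in_queue
            generation_time = f"{gen_s}s"
            tokens_s = str((num_input_tokens + generated_tokens) / gen_s)
        return (
            f"Total time: {total_request_time}s, Queue time: {queue_time}, "
            f"Generation+async time: {generation_time}, tokens/s: {tokens_s}"
        )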