@@ -305,12 +305,50 @@ async def start(self):
         logger.info("Started vLLM engine.")

     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine the v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, vLLM tries to use the v1 engine. However,
+        #    if any feature specified in the engine config is not supported, it
+        #    falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM falls back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, vLLM uses the v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, vLLM is forced to use the v0 engine.
+        # In Ray Serve LLM, we forbid case 1 because we have to know exactly
+        # which engine is used.
+        if not envs.is_set("VLLM_USE_V1"):
+            logger.warning(
+                "VLLM_USE_V1 environment variable is not set, so vLLM v0 is "
+                "used by default. We may switch the default to v1 once vLLM "
+                "v1 is mature."
+            )
+            envs.set_vllm_use_v1(False)
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine.
+
+        Note that we only use _get_async_engine_args to get the engine args
+        and don't use _get_vllm_engine_config, because we integrate vLLM v1
+        through the highest-level async engine API.
+        TODO: Refactor the vLLM v0 integration to use the same async engine
+        API to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient

         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
         engine_args, engine_config = _get_vllm_engine_config(self.llm_config)

-        if MQLLMEngineClient.is_unsupported_config(engine_args):
+        if MQLLMEngineClient.is_unsupported_config(engine_config):
             # If the engine is not supported, we fall back to the legacy async engine.
             #
             # Note (genesu): as of 2025-02-11, this code path is only triggered when
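
The gating above is the key behavioral change: by pinning VLLM_USE_V1 before engine startup, Serve never hits vLLM's silent v1-to-v0 fallback (case 1). For reference, the same three-case decision reduces to a small amount of environment-variable logic; a minimal standalone sketch, using plain os.environ instead of vllm.envs and a hypothetical resolve_use_v1 helper:

    import os

    def resolve_use_v1() -> bool:
        # Case 1 (unset) is forbidden: pin the variable so this process and
        # any subprocess vLLM spawns agree on which engine is in use.
        if "VLLM_USE_V1" not in os.environ:
            os.environ["VLLM_USE_V1"] = "0"  # assumed default, mirroring the diff
        # Cases 2 and 3: the explicit value decides the engine.
        return os.environ["VLLM_USE_V1"] == "1"
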
@@ -342,6 +380,11 @@ async def _start_mq_engine(
                 placement_group=placement_group,
                 placement_group_capture_child_tasks=True,
             ),
+            runtime_env=dict(
+                env_vars=dict(
+                    VLLM_USE_V1="0",
+                ),
+            ),
         )(_EngineBackgroundProcess)
         # Run the process in the background
         process_ref = BackgroundCls.remote(ipc_path, engine_args, engine_config)
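
Setting runtime_env on the background actor matters because the MQ engine runs in a separate worker process, where the driver-side envs.set_vllm_use_v1(False) has no effect. A minimal sketch of how Ray propagates env_vars to a worker, with a hypothetical read_flag task (assumes a running Ray installation):

    import os
    import ray

    ray.init()

    @ray.remote(runtime_env={"env_vars": {"VLLM_USE_V1": "0"}})
    def read_flag() -> str:
        # The worker sees the pinned value regardless of the driver's environment.
        return os.environ["VLLM_USE_V1"]

    assert ray.get(read_flag.remote()) == "0"
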
@@ -502,20 +545,36 @@ async def _generate(
         )

         if request_output is not None:
-            time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
             total_request_time = time.perf_counter() - start
-            generation_time = (
-                total_request_time - request_output.metrics.time_in_queue
-            )
+            if request_output.metrics is None:
+                # vLLM v1 metrics are not included in the request output yet.
+                queue_time = "N/A"
+                generation_time_str = "N/A"
+                tokens_s = "N/A"
+                generated_tokens_s = "N/A"
+            else:
+                time_in_queue_histogram.observe(
+                    request_output.metrics.time_in_queue
+                )
+                queue_time = f"{request_output.metrics.time_in_queue}s"
+                generation_time = (
+                    total_request_time - request_output.metrics.time_in_queue
+                )
+                generation_time_str = f"{generation_time}s"
+                tokens_s = (
+                    num_input_tokens + all_tokens_collected
+                ) / generation_time
+                generated_tokens_s = all_tokens_collected / generation_time
+
             logger.info(
                 f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                 f"Total time: {total_request_time}s, "
-                f"Queue time: {request_output.metrics.time_in_queue}s, "
-                f"Generation+async time: {generation_time}s, "
+                f"Queue time: {queue_time}, "
+                f"Generation+async time: {generation_time_str}, "
                 f"Input tokens: {num_input_tokens}, "
                 f"Generated tokens: {all_tokens_collected}, "
-                f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                f"generated tokens/s: {all_tokens_collected / generation_time}."
+                f"tokens/s: {tokens_s}, "
+                f"generated tokens/s: {generated_tokens_s}."
             )
         else:
             logger.warning(
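
The logging changes in the last hunk all follow one pattern: every metrics-derived value gets an "N/A" fallback because v1 request outputs may carry metrics=None. A minimal sketch of that guard, with a hypothetical format_queue_time helper:

    from typing import Optional

    def format_queue_time(time_in_queue: Optional[float]) -> str:
        # Mirror the guard above: fall back to "N/A" rather than raising
        # AttributeError when v1 omits metrics.
        if time_in_queue is None:
            return "N/A"
        return f"{time_in_queue}s"

    assert format_queue_time(None) == "N/A"
    assert format_queue_time(0.25) == "0.25s"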