Skip to content

Commit 96349e8

Browse files
authored
[llm] Bump vLLM version to support new models (#51726)
Signed-off-by: Linkun Chen <github@lkchen.net>
1 parent 506a468 commit 96349e8

24 files changed

+1492
-747
lines changed

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 68 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -305,12 +305,50 @@ async def start(self):
305305
logger.info("Started vLLM engine.")
306306

307307
async def _start_engine(self) -> "EngineClient":
    """Start the vLLM engine, dispatching on the VLLM_USE_V1 setting.

    Ray Serve LLM must know exactly which engine generation is running, so
    vLLM's automatic v1-with-fallback-to-v0 selection is never relied upon:
    when VLLM_USE_V1 is unset it is pinned to v0 before dispatching.

    Returns:
        The engine client produced by the v0 or the v1 start-up path.
    """
    from vllm import envs

    # Engine selection in vLLM >= 0.8.0:
    #   * VLLM_USE_V1 unset -> vLLM tries v1 but silently falls back to v0 as
    #     soon as an unsupported feature is requested. Launching vLLM on a
    #     non-main thread is experimental, so that alone triggers the fallback.
    #   * VLLM_USE_V1=1     -> v1 is used even with experimental features.
    #   * VLLM_USE_V1=0     -> v0 is forced.
    # The first (ambiguous) case is ruled out here by pinning the flag.
    flag_is_explicit = envs.is_set("VLLM_USE_V1")
    if not flag_is_explicit:
        logger.warning(
            "VLLM_USE_V1 environment variable is not set, using vLLM v0 as default. "
            "Later we may switch default to use v1 once vLLM v1 is mature."
        )
        envs.set_vllm_use_v1(False)

    if envs.VLLM_USE_V1:
        return await self._start_engine_v1()
    return await self._start_engine_v0()
328+
329+
async def _start_engine_v1(self) -> "EngineClient":
    """Start the vLLM v1 engine through the top-level async engine API.

    Only ``_get_async_engine_args`` is consulted here; ``_get_vllm_engine_config``
    is deliberately not used because the v1 integration goes through the
    highest-level async engine entry point.

    TODO: Refactor the vLLM v0 integration onto the same async engine API
    to simplify the code.
    """
    from vllm import AsyncLLMEngine

    # Node initialization must complete before the engine args are derived.
    await self.initialize_node(self.llm_config)
    async_args = _get_async_engine_args(self.llm_config)

    return AsyncLLMEngine.from_engine_args(engine_args=async_args)
344+
345+
async def _start_engine_v0(self) -> "EngineClient":
308346
from vllm.engine.multiprocessing.client import MQLLMEngineClient
309347

310348
args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
311349
engine_args, engine_config = _get_vllm_engine_config(self.llm_config)
312350

313-
if MQLLMEngineClient.is_unsupported_config(engine_args):
351+
if MQLLMEngineClient.is_unsupported_config(engine_config):
314352
# If the engine is not supported, we fall back to the legacy async engine.
315353
#
316354
# Note (genesu): as of 2025-02-11, this code path is only triggered when
@@ -342,6 +380,11 @@ async def _start_mq_engine(
342380
placement_group=placement_group,
343381
placement_group_capture_child_tasks=True,
344382
),
383+
runtime_env=dict(
384+
env_vars=dict(
385+
VLLM_USE_V1="0",
386+
),
387+
),
345388
)(_EngineBackgroundProcess)
346389
# Run the process in the background
347390
process_ref = BackgroundCls.remote(ipc_path, engine_args, engine_config)
@@ -502,20 +545,36 @@ async def _generate(
502545
)
503546

504547
if request_output is not None:
505-
time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
506548
total_request_time = time.perf_counter() - start
507-
generation_time = (
508-
total_request_time - request_output.metrics.time_in_queue
509-
)
549+
if request_output.metrics is None:
550+
# vLLM V1 metrics are not included in the request output yet.
551+
queue_time = "N/A"
552+
generation_time_str = "N/A"
553+
tokens_s = "N/A"
554+
generated_tokens_s = "N/A"
555+
else:
556+
time_in_queue_histogram.observe(
557+
request_output.metrics.time_in_queue
558+
)
559+
queue_time = f"{request_output.metrics.time_in_queue}s"
560+
generation_time = (
561+
total_request_time - request_output.metrics.time_in_queue
562+
)
563+
generation_time_str = f"{generation_time}s"
564+
tokens_s = (
565+
num_input_tokens + all_tokens_collected
566+
) / generation_time
567+
generated_tokens_s = all_tokens_collected / generation_time
568+
510569
logger.info(
511570
f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
512571
f"Total time: {total_request_time}s, "
513-
f"Queue time: {request_output.metrics.time_in_queue}s, "
514-
f"Generation+async time: {generation_time}s, "
572+
f"Queue time: {queue_time}, "
573+
f"Generation+async time: {generation_time_str}, "
515574
f"Input tokens: {num_input_tokens}, "
516575
f"Generated tokens: {all_tokens_collected}, "
517-
f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
518-
f"generated tokens/s: {all_tokens_collected / generation_time}."
576+
f"tokens/s: {tokens_s}, "
577+
f"generated tokens/s: {generated_tokens_s}."
519578
)
520579
else:
521580
logger.warning(

python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,9 @@ async def test_vllm_wrapper_semaphore(model_llama_3_2_216M):
171171
patch(
172172
"ray.llm._internal.batch.stages.vllm_engine_stage.vLLMEngineWrapper.generate_async_v0"
173173
) as mock_generate_async_v0,
174+
patch(
175+
"ray.llm._internal.batch.stages.vllm_engine_stage.vLLMEngineWrapper.generate_async_v1"
176+
) as mock_generate_async_v1,
174177
):
175178
mock_engine.from_engine_args.return_value = AsyncMock()
176179
num_running_requests = 0
@@ -207,6 +210,7 @@ async def mock_generate(request):
207210
)
208211

209212
mock_generate_async_v0.side_effect = mock_generate
213+
mock_generate_async_v1.side_effect = mock_generate
210214

211215
# Create wrapper with max 2 pending requests
212216
wrapper = vLLMEngineWrapper(
@@ -227,7 +231,10 @@ async def mock_generate(request):
227231
await asyncio.gather(*tasks)
228232

229233
# Verify all requests were processed
230-
assert mock_generate_async_v0.call_count == 10
234+
assert (
235+
mock_generate_async_v0.call_count == 10
236+
or mock_generate_async_v1.call_count == 10
237+
)
231238

232239

233240
@pytest.mark.asyncio

python/requirements/llm/llm-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Keep this in sync with the definition in setup.py for ray[llm]
2-
vllm>=0.7.2
2+
vllm>=0.8.2
33
# For json mode
44
jsonref>=1.1.0
55
jsonschema

python/requirements/test-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ boto3==1.26.76
1515
cloudpickle==2.2.0
1616
cryptography==42.0.5
1717
cython==0.29.37
18-
fastapi==0.109.2
18+
fastapi>=0.115.0
1919
feather-format==0.4.1
2020
# Keep compatible with Werkzeug
2121
flask==2.1.3

python/requirements_compiled.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -485,7 +485,7 @@ fairscale==0.4.6
485485
# via -r python/requirements/ml/tune-test-requirements.txt
486486
farama-notifications==0.0.4
487487
# via gymnasium
488-
fastapi==0.109.2
488+
fastapi==0.115.0
489489
# via
490490
# -r python/requirements.txt
491491
# -r python/requirements/test-requirements.txt
@@ -2140,7 +2140,7 @@ stack-data==0.6.3
21402140
# via ipython
21412141
stanio==0.3.0
21422142
# via cmdstanpy
2143-
starlette==0.36.3
2143+
starlette==0.37.2
21442144
# via
21452145
# -r python/requirements.txt
21462146
# fastapi

python/requirements_compiled_ray_py311_cpu.txt

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cpu.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cpu.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cpu.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cpu.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_py311_cu121.txt

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cu121.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cu121.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cu121.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cu121.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_py311_cu124.txt

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cu124.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cu124.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cu124.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cu124.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_test_py311_cpu.txt

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -601,9 +601,9 @@ farama-notifications==0.0.4 \
601601
# via
602602
# -c /tmp/ray-deps/requirements_compiled.txt
603603
# gymnasium
604-
fastapi==0.109.2 \
605-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
606-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
604+
fastapi==0.115.0 \
605+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
606+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
607607
# via
608608
# -c /tmp/ray-deps/requirements_compiled.txt
609609
# -r python/requirements.txt
@@ -2866,9 +2866,9 @@ stack-data==0.6.3 \
28662866
# via
28672867
# -c /tmp/ray-deps/requirements_compiled.txt
28682868
# ipython
2869-
starlette==0.36.3 \
2870-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
2871-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
2869+
starlette==0.37.2 \
2870+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
2871+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
28722872
# via
28732873
# -c /tmp/ray-deps/requirements_compiled.txt
28742874
# -r python/requirements.txt

python/requirements_compiled_ray_test_py311_cu121.txt

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -601,9 +601,9 @@ farama-notifications==0.0.4 \
601601
# via
602602
# -c /tmp/ray-deps/requirements_compiled.txt
603603
# gymnasium
604-
fastapi==0.109.2 \
605-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
606-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
604+
fastapi==0.115.0 \
605+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
606+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
607607
# via
608608
# -c /tmp/ray-deps/requirements_compiled.txt
609609
# -r python/requirements.txt
@@ -2866,9 +2866,9 @@ stack-data==0.6.3 \
28662866
# via
28672867
# -c /tmp/ray-deps/requirements_compiled.txt
28682868
# ipython
2869-
starlette==0.36.3 \
2870-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
2871-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
2869+
starlette==0.37.2 \
2870+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
2871+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
28722872
# via
28732873
# -c /tmp/ray-deps/requirements_compiled.txt
28742874
# -r python/requirements.txt

0 commit comments

Comments
 (0)