
Commit a872286

lk-chen and eicherseiji authored
[llm] bump vllm to 0.9.0.1 (#53443)
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Seiji Eicher <seiji@anyscale.com>
1 parent d96edaf commit a872286

18 files changed: +1149 additions, -958 deletions

python/ray/llm/_internal/batch/processor/vllm_engine_proc.py

Lines changed: 3 additions & 1 deletion

@@ -202,7 +202,9 @@ def build_vllm_engine_processor(
        model_path,
        trust_remote_code=config.engine_kwargs.get("trust_remote_code", False),
    )
-    architecture = getattr(hf_config, "architectures", [DEFAULT_MODEL_ARCHITECTURE])[0]
+    architectures = getattr(hf_config, "architectures", [])
+    architecture = architectures[0] if architectures else DEFAULT_MODEL_ARCHITECTURE

    telemetry_agent = get_or_create_telemetry_agent()
    telemetry_agent.push_telemetry_report(
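
Context for this change: the old one-liner indexed into the getattr default, so it raised IndexError whenever a Hugging Face config defined architectures as an empty list (and TypeError when it was None). A minimal standalone sketch of the new fallback; resolve_architecture and the DEFAULT_MODEL_ARCHITECTURE value here are illustrative stand-ins, not the actual Ray code:

from types import SimpleNamespace

DEFAULT_MODEL_ARCHITECTURE = "UNSPECIFIED"  # illustrative placeholder value


def resolve_architecture(hf_config) -> str:
    # Mirrors the new logic: only index when the list is actually non-empty.
    architectures = getattr(hf_config, "architectures", [])
    return architectures[0] if architectures else DEFAULT_MODEL_ARCHITECTURE


# A config that declares its architecture normally.
assert resolve_architecture(SimpleNamespace(architectures=["LlamaForCausalLM"])) == "LlamaForCausalLM"
# An empty list: the old getattr(...)[0] form would raise IndexError here.
assert resolve_architecture(SimpleNamespace(architectures=[])) == "UNSPECIFIED"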

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 20 additions & 25 deletions

@@ -126,6 +126,13 @@ def _clear_current_platform_cache():
    """
    from vllm.platforms import current_platform

+    # TODO(seiji): remove this once https://github.com/vllm-project/vllm/pull/18979 is merged
+    if (
+        "CUDA_VISIBLE_DEVICES" in os.environ
+        and os.environ["CUDA_VISIBLE_DEVICES"] == ""
+    ):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
    # This check is just to future proof this implementation
    # in case vllm removes their lru_cache decorator
    if hasattr(current_platform.get_device_capability, "cache_clear"):

@@ -200,35 +207,23 @@ def __init__(
            "VLLM_NIXL_SIDE_CHANNEL_PORT" not in vllm.envs.environment_variables
            or "VLLM_NIXL_SIDE_CHANNEL_HOST" not in vllm.envs.environment_variables
        ):
-            logger.warning(
+            raise ValueError(
                "This vLLM version does not support VLLM_NIXL_SIDE_CHANNEL_PORT"
                "or VLLM_NIXL_SIDE_CHANNEL_HOST environment variable. It's likely"
                "that you are using an older version of vLLM."
            )
-        else:
-            if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
-                port: int = vllm.utils.get_open_port()
-                os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port)
-            if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
-                os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
-
-            # We need to overwrite the engine_id to make it unique across replicas.
-            # "engine_id" is added in vllm 0.9.0, so do existance check.
-            try:
-                engine_id = getattr(
-                    kv_transfer_config, "engine_id", str(uuid.uuid4())
-                )
-                host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
-                port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
-                kv_transfer_config.engine_id = "-".join(
-                    [engine_id, host, str(port)]
-                )
-            except ValueError:
-                # TODO(lk-chen): Raise error once vllm 0.9.0 is pinned to rayllm
-                logger.warning(
-                    "engine_id is not supported in vllm < 0.9.0, NIXL-backed kv transfer "
-                    "is not supported."
-                )
+
+        if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
+            port: int = vllm.utils.get_open_port()
+            os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port)
+        if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
+            os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
+
+        # We need to overwrite the engine_id to make it unique across replicas.
+        engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
+        host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+        port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
+        kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])

        assert isinstance(
            llm_config, LLMConfig
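
For readability, here is the new NIXL side-channel path condensed into a standalone helper. ensure_nixl_side_channel is a hypothetical name used only for this sketch, but the vllm.envs and vllm.utils calls are the same ones used in the diff above:

import os
import uuid

import vllm.envs
import vllm.utils


def ensure_nixl_side_channel(kv_transfer_config) -> None:
    # Pick an open port for the NIXL side channel if the user did not set one.
    if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
        os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(vllm.utils.get_open_port())
    # Advertise this node's IP if the user did not set one.
    if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
        os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
    # Append host and port so the engine_id is unique across Serve replicas.
    engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
    host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
    port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
    kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])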

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_loggers.py

Lines changed: 7 additions & 2 deletions

@@ -427,14 +427,16 @@ def _unregister_vllm_metrics(self) -> None:
        pass


+# TODO(seiji): remove this whole file once we bump to vLLM that includes
+# https://github.com/vllm-project/vllm/pull/19113
 class PrometheusStatLogger(StatLoggerBase):
    _metrics_cls = Metrics

    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
        self.metrics = self._metrics_cls(
            vllm_config=vllm_config, engine_index=engine_index
        )
-
+        self.vllm_config = vllm_config
        #
        # Cache config info metric
        #

@@ -452,7 +454,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
-        info_gauge = prometheus_client.Gauge(
+        info_gauge = self._metrics_cls._gauge_cls(
            name=name, documentation=documentation, labelnames=metrics_info.keys()
        ).labels(**metrics_info)
        info_gauge.set(1)

@@ -542,6 +544,9 @@ def record(
            **lora_info_labels
        ).set_to_current_time()

+    def log_engine_initialized(self):
+        self.log_metrics_info("cache_config", self.vllm_config.cache_config)
+

 class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""

python/ray/llm/tests/serve/mocks/mock_vllm_engine.py

Lines changed: 10 additions & 6 deletions

@@ -7,7 +7,7 @@
 from PIL import Image
 from transformers import AutoTokenizer
 from vllm import CompletionOutput, PromptType, RequestOutput
-from vllm.config import KVTransferConfig, ModelConfig, VllmConfig
+from vllm.config import DeviceConfig, KVTransferConfig, ModelConfig, VllmConfig
 from vllm.engine.protocol import EngineClient
 from vllm.sampling_params import SamplingParams as VLLMInternalSamplingParams

@@ -488,10 +488,7 @@ async def generate_response():
                    logprobs=None,
                )
            ],
-            # In vllm==0.8.5, RequestOutput does not accept kv_transfer_params
-            # which will raise exception. see https://github.com/vllm-project/vllm/pull/18513
-            # TODO(lk-chen): uncomment this once we bump vllm version in test env.
-            # kv_transfer_params=kv_transfer_params,
+            kv_transfer_params=kv_transfer_params,
        )

    return generate_response()

@@ -579,6 +576,10 @@ async def add_lora(self, lora_request) -> None:
        """Load a new LoRA adapter into the engine for future requests."""
        raise NotImplementedError("Not expected to be reached")

+    async def reset_mm_cache(self) -> None:
+        """Reset the multi-modal cache"""
+        raise NotImplementedError("Not expected to be reached")
+

 class MockPDDisaggVLLMEngine(VLLMEngine):
    async def _start_engine(self) -> EngineClient:

@@ -592,7 +593,10 @@ async def _start_engine(self) -> EngineClient:
                    trust_remote_code=False,
                    dtype="auto",
                    seed=0,
-                )
+                ),
+                device_config=DeviceConfig(
+                    device="cpu",
+                ),
            )
        )
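
The reset_mm_cache stub is needed because the EngineClient interface in the newer vLLM exposes that method, so a mock standing in for the engine has to provide it even if it only raises. A tiny illustration of the pattern (the class name is hypothetical):

class _StubEngineClient:
    # Mocks that stand in for vLLM's EngineClient must grow the methods newer
    # vLLM versions may call, even if they are plain stubs.
    async def add_lora(self, lora_request) -> None:
        raise NotImplementedError("Not expected to be reached")

    async def reset_mm_cache(self) -> None:
        # Newer vLLM resets the multi-modal cache through this hook.
        raise NotImplementedError("Not expected to be reached")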

python/ray/util/metrics.py

Lines changed: 1 addition & 0 deletions

@@ -138,6 +138,7 @@ def _validate_tags(self, final_tags):
            if tag_key not in final_tags:
                missing_tags.append(tag_key)

+        # Strict validation: if any required tag_keys are missing, raise error
        if missing_tags:
            raise ValueError(f"Missing value for tag key(s): {','.join(missing_tags)}.")
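
The added comment documents existing behavior in ray.util.metrics: every tag key declared on a metric must be supplied when recording. A short usage sketch of that validation (the metric name and tag values are examples):

import ray
from ray.util.metrics import Counter

ray.init()

# Declaring a tag key makes it mandatory at record time.
requests = Counter(
    "example_requests",
    description="Example counter used to illustrate tag validation.",
    tag_keys=("model",),
)

requests.inc(1.0, tags={"model": "llama"})  # OK: all declared tags supplied

try:
    requests.inc(1.0, tags={})  # "model" missing
except ValueError as err:
    print(err)  # -> Missing value for tag key(s): model.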

python/requirements/llm/llm-requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 # Keep this in sync with the definition in setup.py for ray[llm]
-vllm>=0.8.5
+vllm>=0.9.0.1
 # For json mode
 jsonref>=1.1.0
 jsonschema

python/requirements/llm/llm-test-requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ aiohttp
 pillow
 httpx>=0.27.2
 pynvml>=12.0.0
-xgrammar==0.1.18
+xgrammar==0.1.19
 jupytext>1.13.6
 sphinx==6.2.1
 backoff
