
Commit a872286

lk-chen and eicherseiji authored
[llm] bump vllm to 0.9.0.1 (#53443)
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Seiji Eicher <seiji@anyscale.com>
1 parent d96edaf commit a872286

18 files changed: +1149 additions, -958 deletions

python/ray/llm/_internal/batch/processor/vllm_engine_proc.py

Lines changed: 3 additions & 1 deletion

@@ -202,7 +202,9 @@ def build_vllm_engine_processor(
        model_path,
        trust_remote_code=config.engine_kwargs.get("trust_remote_code", False),
    )
-    architecture = getattr(hf_config, "architectures", [DEFAULT_MODEL_ARCHITECTURE])[0]
+    architectures = getattr(hf_config, "architectures", [])
+    architecture = architectures[0] if architectures else DEFAULT_MODEL_ARCHITECTURE

    telemetry_agent = get_or_create_telemetry_agent()
    telemetry_agent.push_telemetry_report(
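
Context for this change: the old one-liner indexed into the getattr default, so it raised IndexError whenever a Hugging Face config defined architectures as an empty list (and TypeError when it was None). A minimal standalone sketch of the new fallback; resolve_architecture and the DEFAULT_MODEL_ARCHITECTURE value here are illustrative stand-ins, not the actual Ray code:

from types import SimpleNamespace

DEFAULT_MODEL_ARCHITECTURE = "UNSPECIFIED"  # illustrative placeholder value


def resolve_architecture(hf_config) -> str:
    # Mirrors the new logic: only index when the list is actually non-empty.
    architectures = getattr(hf_config, "architectures", [])
    return architectures[0] if architectures else DEFAULT_MODEL_ARCHITECTURE


# A config that declares its architecture normally.
assert resolve_architecture(SimpleNamespace(architectures=["LlamaForCausalLM"])) == "LlamaForCausalLM"
# An empty list: the old getattr(...)[0] form would raise IndexError here.
assert resolve_architecture(SimpleNamespace(architectures=[])) == "UNSPECIFIED"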

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 20 additions & 25 deletions

@@ -126,6 +126,13 @@ def _clear_current_platform_cache():
    """
    from vllm.platforms import current_platform

+    # TODO(seiji): remove this once https://github.com/vllm-project/vllm/pull/18979 is merged
+    if (
+        "CUDA_VISIBLE_DEVICES" in os.environ
+        and os.environ["CUDA_VISIBLE_DEVICES"] == ""
+    ):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
    # This check is just to future proof this implementation
    # in case vllm removes their lru_cache decorator
    if hasattr(current_platform.get_device_capability, "cache_clear"):

@@ -200,35 +207,23 @@ def __init__(
            "VLLM_NIXL_SIDE_CHANNEL_PORT" not in vllm.envs.environment_variables
            or "VLLM_NIXL_SIDE_CHANNEL_HOST" not in vllm.envs.environment_variables
        ):
-            logger.warning(
+            raise ValueError(
                "This vLLM version does not support VLLM_NIXL_SIDE_CHANNEL_PORT"
                "or VLLM_NIXL_SIDE_CHANNEL_HOST environment variable. It's likely"
                "that you are using an older version of vLLM."
            )
-        else:
-            if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
-                port: int = vllm.utils.get_open_port()
-                os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port)
-            if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
-                os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
-
-            # We need to overwrite the engine_id to make it unique across replicas.
-            # "engine_id" is added in vllm 0.9.0, so do existance check.
-            try:
-                engine_id = getattr(
-                    kv_transfer_config, "engine_id", str(uuid.uuid4())
-                )
-                host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
-                port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
-                kv_transfer_config.engine_id = "-".join(
-                    [engine_id, host, str(port)]
-                )
-            except ValueError:
-                # TODO(lk-chen): Raise error once vllm 0.9.0 is pinned to rayllm
-                logger.warning(
-                    "engine_id is not supported in vllm < 0.9.0, NIXL-backed kv transfer "
-                    "is not supported."
-                )
+
+        if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
+            port: int = vllm.utils.get_open_port()
+            os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port)
+        if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
+            os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
+
+        # We need to overwrite the engine_id to make it unique across replicas.
+        engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
+        host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+        port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
+        kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])

        assert isinstance(
            llm_config, LLMConfig
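
For readability, here is the new NIXL side-channel path condensed into a standalone helper. ensure_nixl_side_channel is a hypothetical name used only for this sketch, but the vllm.envs and vllm.utils calls are the same ones used in the diff above:

import os
import uuid

import vllm.envs
import vllm.utils


def ensure_nixl_side_channel(kv_transfer_config) -> None:
    # Pick an open port for the NIXL side channel if the user did not set one.
    if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_PORT"):
        os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(vllm.utils.get_open_port())
    # Advertise this node's IP if the user did not set one.
    if not vllm.envs.is_set("VLLM_NIXL_SIDE_CHANNEL_HOST"):
        os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = vllm.utils.get_ip()
    # Append host and port so the engine_id is unique across Serve replicas.
    engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
    host = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_HOST
    port = vllm.envs.VLLM_NIXL_SIDE_CHANNEL_PORT
    kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])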

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_loggers.py

Lines changed: 7 additions & 2 deletions

@@ -427,14 +427,16 @@ def _unregister_vllm_metrics(self) -> None:
        pass


+# TODO(seiji): remove this whole file once we bump to vLLM that includes
+# https://github.com/vllm-project/vllm/pull/19113
 class PrometheusStatLogger(StatLoggerBase):
    _metrics_cls = Metrics

    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
        self.metrics = self._metrics_cls(
            vllm_config=vllm_config, engine_index=engine_index
        )
-
+        self.vllm_config = vllm_config
        #
        # Cache config info metric
        #

@@ -452,7 +454,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
-        info_gauge = prometheus_client.Gauge(
+        info_gauge = self._metrics_cls._gauge_cls(
            name=name, documentation=documentation, labelnames=metrics_info.keys()
        ).labels(**metrics_info)
        info_gauge.set(1)

@@ -542,6 +544,9 @@ def record(
            **lora_info_labels
        ).set_to_current_time()

+    def log_engine_initialized(self):
+        self.log_metrics_info("cache_config", self.vllm_config.cache_config)
+

 class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""

python/ray/llm/tests/serve/mocks/mock_vllm_engine.py

Lines changed: 10 additions & 6 deletions

@@ -7,7 +7,7 @@
 from PIL import Image
 from transformers import AutoTokenizer
 from vllm import CompletionOutput, PromptType, RequestOutput
-from vllm.config import KVTransferConfig, ModelConfig, VllmConfig
+from vllm.config import DeviceConfig, KVTransferConfig, ModelConfig, VllmConfig
 from vllm.engine.protocol import EngineClient
 from vllm.sampling_params import SamplingParams as VLLMInternalSamplingParams

@@ -488,10 +488,7 @@ async def generate_response():
                    logprobs=None,
                )
            ],
-            # In vllm==0.8.5, RequestOutput does not accept kv_transfer_params
-            # which will raise exception. see https://github.com/vllm-project/vllm/pull/18513
-            # TODO(lk-chen): uncomment this once we bump vllm version in test env.
-            # kv_transfer_params=kv_transfer_params,
+            kv_transfer_params=kv_transfer_params,
        )

    return generate_response()

@@ -579,6 +576,10 @@ async def add_lora(self, lora_request) -> None:
        """Load a new LoRA adapter into the engine for future requests."""
        raise NotImplementedError("Not expected to be reached")

+    async def reset_mm_cache(self) -> None:
+        """Reset the multi-modal cache"""
+        raise NotImplementedError("Not expected to be reached")
+

 class MockPDDisaggVLLMEngine(VLLMEngine):
    async def _start_engine(self) -> EngineClient:

@@ -592,7 +593,10 @@ async def _start_engine(self) -> EngineClient:
                    trust_remote_code=False,
                    dtype="auto",
                    seed=0,
-                )
+                ),
+                device_config=DeviceConfig(
+                    device="cpu",
+                ),
            )
        )
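
The reset_mm_cache stub is needed because the EngineClient interface in the newer vLLM exposes that method, so a mock standing in for the engine has to provide it even if it only raises. A tiny illustration of the pattern (the class name is hypothetical):

class _StubEngineClient:
    # Mocks that stand in for vLLM's EngineClient must grow the methods newer
    # vLLM versions may call, even if they are plain stubs.
    async def add_lora(self, lora_request) -> None:
        raise NotImplementedError("Not expected to be reached")

    async def reset_mm_cache(self) -> None:
        # Newer vLLM resets the multi-modal cache through this hook.
        raise NotImplementedError("Not expected to be reached")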

python/ray/util/metrics.py

Lines changed: 1 addition & 0 deletions

@@ -138,6 +138,7 @@ def _validate_tags(self, final_tags):
            if tag_key not in final_tags:
                missing_tags.append(tag_key)

+        # Strict validation: if any required tag_keys are missing, raise error
        if missing_tags:
            raise ValueError(f"Missing value for tag key(s): {','.join(missing_tags)}.")
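
The added comment documents existing behavior in ray.util.metrics: every tag key declared on a metric must be supplied when recording. A short usage sketch of that validation (the metric name and tag values are examples):

import ray
from ray.util.metrics import Counter

ray.init()

# Declaring a tag key makes it mandatory at record time.
requests = Counter(
    "example_requests",
    description="Example counter used to illustrate tag validation.",
    tag_keys=("model",),
)

requests.inc(1.0, tags={"model": "llama"})  # OK: all declared tags supplied

try:
    requests.inc(1.0, tags={})  # "model" missing
except ValueError as err:
    print(err)  # -> Missing value for tag key(s): model.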

python/requirements/llm/llm-requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 # Keep this in sync with the definition in setup.py for ray[llm]
-vllm>=0.8.5
+vllm>=0.9.0.1
 # For json mode
 jsonref>=1.1.0
 jsonschema

python/requirements/llm/llm-test-requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ aiohttp
 pillow
 httpx>=0.27.2
 pynvml>=12.0.0
-xgrammar==0.1.18
+xgrammar==0.1.19
 jupytext>1.13.6
 sphinx==6.2.1
 backoff
