@@ -305,12 +305,50 @@ async def start(self):
         logger.info("Started vLLM engine.")

     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine the v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, vLLM tries to use the v1 engine. However,
+        #    if any feature specified in the engine config is not supported, it
+        #    falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM falls back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, vLLM uses the v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, vLLM is forced to use the v0 engine.
+        # In Ray Serve LLM, we forbid case 1 because we have to know exactly
+        # which engine is used.
+        if not envs.is_set("VLLM_USE_V1"):
+            logger.warning(
+                "VLLM_USE_V1 environment variable is not set, so vLLM v0 is "
+                "used by default. We may switch the default to v1 once vLLM "
+                "v1 is mature."
+            )
+            envs.set_vllm_use_v1(False)
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine.
+
+        Note that we only use _get_async_engine_args to get the engine args
+        and don't use _get_vllm_engine_config, because we integrate vLLM v1
+        through the highest-level async engine API.
+        TODO: Refactor the vLLM v0 integration to use the same async engine
+        API to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient

         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
         engine_args, engine_config = _get_vllm_engine_config(self.llm_config)

-        if MQLLMEngineClient.is_unsupported_config(engine_args):
+        if MQLLMEngineClient.is_unsupported_config(engine_config):
             # If the engine is not supported, we fall back to the legacy async engine.
             #
             # Note (genesu): as of 2025-02-11, this code path is only triggered when
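
The gating above is the key behavioral change: by pinning VLLM_USE_V1 before engine startup, Serve never hits vLLM's silent v1-to-v0 fallback (case 1). For reference, the same three-case decision reduces to a small amount of environment-variable logic; a minimal standalone sketch, using plain os.environ instead of vllm.envs and a hypothetical resolve_use_v1 helper:

    import os

    def resolve_use_v1() -> bool:
        # Case 1 (unset) is forbidden: pin the variable so this process and
        # any subprocess vLLM spawns agree on which engine is in use.
        if "VLLM_USE_V1" not in os.environ:
            os.environ["VLLM_USE_V1"] = "0"  # assumed default, mirroring the diff
        # Cases 2 and 3: the explicit value decides the engine.
        return os.environ["VLLM_USE_V1"] == "1"
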
@@ -342,6 +380,11 @@ async def _start_mq_engine(
                 placement_group=placement_group,
                 placement_group_capture_child_tasks=True,
             ),
+            runtime_env=dict(
+                env_vars=dict(
+                    VLLM_USE_V1="0",
+                ),
+            ),
         )(_EngineBackgroundProcess)
         # Run the process in the background
         process_ref = BackgroundCls.remote(ipc_path, engine_args, engine_config)
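
Setting runtime_env on the background actor matters because the MQ engine runs in a separate worker process, where the driver-side envs.set_vllm_use_v1(False) has no effect. A minimal sketch of how Ray propagates env_vars to a worker, with a hypothetical read_flag task (assumes a running Ray installation):

    import os
    import ray

    ray.init()

    @ray.remote(runtime_env={"env_vars": {"VLLM_USE_V1": "0"}})
    def read_flag() -> str:
        # The worker sees the pinned value regardless of the driver's environment.
        return os.environ["VLLM_USE_V1"]

    assert ray.get(read_flag.remote()) == "0"
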
@@ -502,20 +545,36 @@ async def _generate(
         )

         if request_output is not None:
-            time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
             total_request_time = time.perf_counter() - start
-            generation_time = (
-                total_request_time - request_output.metrics.time_in_queue
-            )
+            if request_output.metrics is None:
+                # vLLM v1 metrics are not included in the request output yet.
+                queue_time = "N/A"
+                generation_time_str = "N/A"
+                tokens_s = "N/A"
+                generated_tokens_s = "N/A"
+            else:
+                time_in_queue_histogram.observe(
+                    request_output.metrics.time_in_queue
+                )
+                queue_time = f"{request_output.metrics.time_in_queue}s"
+                generation_time = (
+                    total_request_time - request_output.metrics.time_in_queue
+                )
+                generation_time_str = f"{generation_time}s"
+                tokens_s = (
+                    num_input_tokens + all_tokens_collected
+                ) / generation_time
+                generated_tokens_s = all_tokens_collected / generation_time
+
             logger.info(
                 f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                 f"Total time: {total_request_time}s, "
-                f"Queue time: {request_output.metrics.time_in_queue}s, "
-                f"Generation+async time: {generation_time}s, "
+                f"Queue time: {queue_time}, "
+                f"Generation+async time: {generation_time_str}, "
                 f"Input tokens: {num_input_tokens}, "
                 f"Generated tokens: {all_tokens_collected}, "
-                f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                f"generated tokens/s: {all_tokens_collected / generation_time}."
+                f"tokens/s: {tokens_s}, "
+                f"generated tokens/s: {generated_tokens_s}."
             )
         else:
             logger.warning(
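
The logging changes in the last hunk all follow one pattern: every metrics-derived value gets an "N/A" fallback because v1 request outputs may carry metrics=None. A minimal sketch of that guard, with a hypothetical format_queue_time helper:

    from typing import Optional

    def format_queue_time(time_in_queue: Optional[float]) -> str:
        # Mirror the guard above: fall back to "N/A" rather than raising
        # AttributeError when v1 omits metrics.
        if time_in_queue is None:
            return "N/A"
        return f"{time_in_queue}s"

    assert format_queue_time(None) == "N/A"
    assert format_queue_time(0.25) == "0.25s"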