Skip to content

Commit a43a3f1

Browse files
authored
[Bugfix][DP] DP distribution does not require ray[default] (#23822)
Signed-off-by: Kebe <mail@kebe7jun.com>
1 parent 6adaed4 commit a43a3f1

File tree

1 file changed

+10
-14
lines changed

1 file changed

+10
-14
lines changed

vllm/v1/engine/utils.py

Lines changed: 10 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,6 @@ def create_dp_placement_groups(
315315

316316
import ray
317317
from ray._private.state import available_resources_per_node
318-
from ray.util.state import list_nodes
319318

320319
logger.info("Creating placement groups for data parallel")
321320
dp_master_ip = \
@@ -324,31 +323,28 @@ def create_dp_placement_groups(
324323
local_engine_count = \
325324
vllm_config.parallel_config.data_parallel_size_local
326325

327-
nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]),
328-
key=lambda node: node.node_ip != dp_master_ip)
329-
assert nodes[0].node_ip == dp_master_ip, (
330-
"The head node is missing or dead")
331-
assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
332-
"There can only be one head node")
333-
334326
available_resources = available_resources_per_node()
335327
world_size = vllm_config.parallel_config.world_size
336328
placement_groups: list[PlacementGroup] = []
337329
local_dp_ranks: list[int] = []
338-
339-
for node in nodes:
340-
node_ip = node.node_ip
341-
node_resources = available_resources[node.node_id]
330+
dp_master_ip_key = f'node:{dp_master_ip}'
331+
nodes = sorted(available_resources.values(),
332+
key=lambda x: dp_master_ip_key not in x)
333+
assert len(nodes) > 0, (
334+
"No nodes with resources found in Ray cluster.")
335+
assert dp_master_ip_key in nodes[0], (
336+
"The DP master node (ip: %s) is missing or dead", dp_master_ip)
337+
for node_resources in nodes:
342338
if "GPU" not in node_resources:
343339
continue
344340
# For now, each DP rank can only be assigned to one node
345341
# TODO(rui): support allocating a single DP rank
346342
# to multiple nodes
347343
available_engine_count = int(node_resources["GPU"]) // world_size
348-
if node_ip == dp_master_ip:
344+
if dp_master_ip_key in node_resources:
349345
assert available_engine_count >= local_engine_count, (
350346
"Not enough resources to allocate DP ranks "
351-
f"on DP master node {node_ip}")
347+
f"on DP master node {dp_master_ip}")
352348
for i in range(local_engine_count):
353349
bundles = [{
354350
"GPU": 1.0,

0 commit comments

Comments
 (0)