@@ -315,7 +315,6 @@ def create_dp_placement_groups(
315315
316316 import ray
317317 from ray ._private .state import available_resources_per_node
318- from ray .util .state import list_nodes
319318
320319 logger .info ("Creating placement groups for data parallel" )
321320 dp_master_ip = \
@@ -324,31 +323,28 @@ def create_dp_placement_groups(
324323 local_engine_count = \
325324 vllm_config .parallel_config .data_parallel_size_local
326325
327- nodes = sorted (list_nodes (filters = [("state" , "=" , "ALIVE" )]),
328- key = lambda node : node .node_ip != dp_master_ip )
329- assert nodes [0 ].node_ip == dp_master_ip , (
330- "The head node is missing or dead" )
331- assert len (nodes ) == 1 or nodes [1 ].node_ip != dp_master_ip , (
332- "There can only be one head node" )
333-
334326 available_resources = available_resources_per_node ()
335327 world_size = vllm_config .parallel_config .world_size
336328 placement_groups : list [PlacementGroup ] = []
337329 local_dp_ranks : list [int ] = []
338-
339- for node in nodes :
340- node_ip = node .node_ip
341- node_resources = available_resources [node .node_id ]
330+ dp_master_ip_key = f'node:{ dp_master_ip } '
331+ nodes = sorted (available_resources .values (),
332+ key = lambda x : dp_master_ip_key not in x )
333+ assert len (nodes ) > 0 , (
334+ "No nodes with resources found in Ray cluster." )
335+ assert dp_master_ip_key in nodes [0 ], (
336+ "The DP master node (ip: %s) is missing or dead" , dp_master_ip )
337+ for node_resources in nodes :
342338 if "GPU" not in node_resources :
343339 continue
344340 # For now, each DP rank can only be assigned to one node
345341 # TODO(rui): support allocating a single DP rank
346342 # to multiple nodes
347343 available_engine_count = int (node_resources ["GPU" ]) // world_size
348- if node_ip == dp_master_ip :
344+ if dp_master_ip_key in node_resources :
349345 assert available_engine_count >= local_engine_count , (
350346 "Not enough resources to allocate DP ranks "
351- f"on DP master node { node_ip } " )
347+ f"on DP master node { dp_master_ip } " )
352348 for i in range (local_engine_count ):
353349 bundles = [{
354350 "GPU" : 1.0 ,
0 commit comments