[misc][distributed] use 127.0.0.1 for single-node (vllm-project#5619)
youkaichao authored and jimpang committed Jul 8, 2024
1 parent 9ea2366 commit c21d8b8
Showing 2 changed files with 15 additions and 2 deletions.
7 changes: 5 additions & 2 deletions vllm/executor/multiproc_gpu_executor.py
@@ -10,7 +10,7 @@
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
-                        get_distributed_init_method, get_ip, get_open_port,
+                        get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async)
 
 logger = init_logger(__name__)
@@ -37,8 +37,11 @@ def _init_executor(self) -> None:
         assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
+        # Multiprocessing-based executor does not support multi-node setting.
+        # Since it only works for single node, we can use the loopback address
+        # 127.0.0.1 for communication.
         distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+            "127.0.0.1", get_open_port())
 
         if world_size == 1:
             self.workers = []
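For context, get_distributed_init_method simply combines a host and a port into the rendezvous URL that the distributed backend consumes, so swapping get_ip() for "127.0.0.1" only changes the host part. A minimal sketch of the call pattern after this change, with helper bodies that approximate (but are not copied from) vllm/utils.py:

    import socket
    from contextlib import closing

    def get_open_port() -> int:
        # Bind to port 0 so the OS assigns a free port, then release it.
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("", 0))
            return s.getsockname()[1]

    def get_distributed_init_method(ip: str, port: int) -> str:
        # A tcp:// URL is the usual torch.distributed rendezvous format.
        return f"tcp://{ip}:{port}"

    # The multiprocessing executor is single-node only, so the loopback
    # address is reachable by every worker regardless of NIC setup.
    print(get_distributed_init_method("127.0.0.1", get_open_port()))
    # e.g. tcp://127.0.0.1:54321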
10 changes: 10 additions & 0 deletions vllm/executor/ray_gpu_executor.py
@@ -161,6 +161,16 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         self._run_workers("update_environment_variables",
                           all_args=all_args_to_update_environment_variables)
 
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
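The NOTE in the diff above is the crux of the change: on a multi-homed machine, get_ip() can return the address of any interface, and that address is not guaranteed to be routable even from the same host. A short, self-contained illustration of the failure mode, using the common UDP-connect trick (an assumption about how such a helper is typically written, not the exact vLLM implementation):

    import socket

    def get_ip() -> str:
        # Open a UDP socket toward an external address and read back the
        # local address the kernel picked. UDP connect() sends no packets.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]

    # On a host with several interfaces (VPN, docker0, secondary NICs),
    # this may return an address that firewall or routing rules make
    # unreachable even from the same machine. 127.0.0.1 always routes
    # locally, so it is the safe choice for single-node communication.
    print(get_ip())      # e.g. 10.0.0.5, depends on network setup
    print("127.0.0.1")   # always valid for intra-node traffic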
