2 files changed: +0 -29 lines
#       Need a PR to vllm to support get port from environment.
#    Future Plan:
#       Remove those patch when vllm merged them
- # 3. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group`
- #    Why:
- #       vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to
- #       get better performance
- #    How:
- #       adopt nccl backend to init process group.(Now we still use gloo, it's just a placeholder, we'll use nccl in the future)
- #    Related PR (if no, explain why):
- #       Need a PR to vllm to support more backend.
- #    Future Plan:
- #       Remove those patch when vllm support more backend.
#
# * Worker Patch:
# ===============
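The deleted comment block documented a patch intended to replace vLLM's default gloo backend with hccl when creating the stateless data-parallel process group. For reference, a minimal sketch of how such an override could be re-applied is shown below; it reuses the stateless_init_torch_distributed_process_group call from the code removed in the second file, and the "hccl" backend string is an assumption, since the comment itself notes that vLLM still needs a PR to accept additional backends.

# Hypothetical sketch only: re-applying the removed override with a non-gloo
# backend. Assumes a vLLM build that accepts "hccl" here; the removed patch
# still passed "gloo" as a placeholder.
from torch.distributed import ProcessGroup
from vllm.config import ParallelConfig
from vllm.distributed.utils import \
    stateless_init_torch_distributed_process_group


def stateless_init_dp_group_hccl(self) -> ProcessGroup:
    # Same call shape as the removed patch, with the backend swapped.
    return stateless_init_torch_distributed_process_group(
        self.data_parallel_master_ip,
        self.get_next_dp_init_port(),
        self.data_parallel_rank,
        self.data_parallel_size,
        backend="hccl")


# Applied at import time, mirroring the assignment the diff removes.
ParallelConfig.stateless_init_dp_group = stateless_init_dp_group_hccl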
import vllm.envs as envs
from torch.distributed import ProcessGroup
from vllm.config import ParallelConfig
- from vllm.distributed.utils import \
-     stateless_init_torch_distributed_process_group

from vllm_ascend.utils import NullHandle, is_310p

@@ -65,25 +63,8 @@ def parallel_config_get_dp_port(self) -> int:
    return port


- def stateless_init_dp_group(self) -> "ProcessGroup":
-     # TODO(Yizhou): Currently we have to set the backend to gloo
-     # because in vllm.config.ParallelConfig.has_unfinished_dp the
-     # device is set to cpu. We need to fix this in the future.
-     # We need to compare the performance of gloo and hccl and then
-     # decide which one to use.
-     dp_group = stateless_init_torch_distributed_process_group(
-         self.data_parallel_master_ip,
-         self.get_next_dp_init_port(),
-         self.data_parallel_rank,
-         self.data_parallel_size,
-         backend="gloo")
-
-     return dp_group
-
-
vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
- ParallelConfig.stateless_init_dp_group = stateless_init_dp_group


def communication_adaptation_310p():
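With that override gone, the only ParallelConfig monkey-patch left in this hunk is the port helper: get_next_dp_init_port is still replaced at import time, while stateless_init_dp_group falls back to vLLM's upstream implementation. A quick, hypothetical sanity check of that state (it assumes the vllm_ascend patch module shown above has already been imported) might look like:

# Hypothetical check: after the patch module runs, get_next_dp_init_port should
# resolve to the vllm_ascend override while stateless_init_dp_group stays in
# vllm.config.
from vllm.config import ParallelConfig

for name in ("get_next_dp_init_port", "stateless_init_dp_group"):
    fn = getattr(ParallelConfig, name)
    print(f"{name} is defined in {getattr(fn, '__module__', '<unknown>')}")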