Skip to content

Commit 14ee2bf

Browse files
committed
refactor(utils): extract NCCL helpers to dedicated module
Move the NCCL library discovery functions to vllm/utils/nccl.py: find_nccl_library (the NCCL/RCCL library resolver) and find_nccl_include_paths (the NCCL header path resolver). This reduces the complexity of vllm/utils/__init__.py and improves modularity for the distributed computing utilities. Contributes to #26900. Signed-off-by: dongbo910220 <1275604947@qq.com>
1 parent 9fce7be commit 14ee2bf

File tree

2 files changed

+73
-85
lines changed

2 files changed

+73
-85
lines changed

vllm/utils/__init__.py

Lines changed: 9 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import multiprocessing
1212
import os
1313
import signal
14-
import subprocess
1514
import sys
1615
import tempfile
1716
import textwrap
@@ -48,6 +47,15 @@
4847
import vllm.envs as envs
4948
from vllm.logger import enable_trace_function_call, init_logger
5049
from vllm.ray.lazy_utils import is_in_ray_actor
50+
from vllm.utils.nccl import (
51+
find_nccl_include_paths,
52+
find_nccl_library,
53+
)
54+
55+
_ = (
56+
find_nccl_include_paths,
57+
find_nccl_library,
58+
)
5159

5260
_DEPRECATED_MAPPINGS = {
5361
"cprofile": "profiling",
@@ -211,90 +219,6 @@ def init_cached_hf_modules() -> None:
211219
init_hf_modules()
212220

213221

214-
@cache
215-
def find_library(lib_name: str) -> str:
216-
"""
217-
Find the library file in the system.
218-
`lib_name` is full filename, with both prefix and suffix.
219-
This function resolves `lib_name` to the full path of the library.
220-
"""
221-
# Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
222-
# According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
223-
# `/sbin/ldconfig` should exist in all Linux systems.
224-
# `/sbin/ldconfig` searches the library in the system
225-
libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
226-
# each line looks like the following:
227-
# libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
228-
locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
229-
# `LD_LIBRARY_PATH` searches the library in the user-defined paths
230-
env_ld_library_path = envs.LD_LIBRARY_PATH
231-
if not locs and env_ld_library_path:
232-
locs = [
233-
os.path.join(dir, lib_name)
234-
for dir in env_ld_library_path.split(":")
235-
if os.path.exists(os.path.join(dir, lib_name))
236-
]
237-
if not locs:
238-
raise ValueError(f"Cannot find {lib_name} in the system.")
239-
return locs[0]
240-
241-
242-
def find_nccl_library() -> str:
243-
"""
244-
We either use the library file specified by the `VLLM_NCCL_SO_PATH`
245-
environment variable, or we find the library file brought by PyTorch.
246-
After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
247-
found by `ctypes` automatically.
248-
"""
249-
so_file = envs.VLLM_NCCL_SO_PATH
250-
251-
# manually load the nccl library
252-
if so_file:
253-
logger.info(
254-
"Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
255-
)
256-
else:
257-
if torch.version.cuda is not None:
258-
so_file = "libnccl.so.2"
259-
elif torch.version.hip is not None:
260-
so_file = "librccl.so.1"
261-
else:
262-
raise ValueError("NCCL only supports CUDA and ROCm backends.")
263-
logger.debug_once("Found nccl from library %s", so_file)
264-
return so_file
265-
266-
267-
def find_nccl_include_paths() -> list[str] | None:
268-
"""
269-
We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH`
270-
environment variable, or we find the library file brought by
271-
nvidia-nccl-cuXX. load_inline by default uses
272-
torch.utils.cpp_extension.include_paths
273-
"""
274-
paths: list[str] = []
275-
inc = envs.VLLM_NCCL_INCLUDE_PATH
276-
if inc and os.path.isdir(inc):
277-
paths.append(inc)
278-
279-
try:
280-
spec = importlib.util.find_spec("nvidia.nccl")
281-
if spec and getattr(spec, "submodule_search_locations", None):
282-
for loc in spec.submodule_search_locations:
283-
inc_dir = os.path.join(loc, "include")
284-
if os.path.exists(os.path.join(inc_dir, "nccl.h")):
285-
paths.append(inc_dir)
286-
except Exception:
287-
pass
288-
289-
seen = set()
290-
out: list[str] = []
291-
for p in paths:
292-
if p and p not in seen:
293-
out.append(p)
294-
seen.add(p)
295-
return out or None
296-
297-
298222
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
299223
"""Set up function tracing for the current thread,
300224
if enabled via the VLLM_TRACE_FUNCTION environment variable

vllm/utils/nccl.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
4+
from __future__ import annotations
5+
6+
import importlib
7+
import os
8+
9+
import torch
10+
11+
import vllm.envs as envs
12+
from vllm.logger import init_logger
13+
14+
logger = init_logger(__name__)
15+
16+
17+
def find_nccl_library() -> str:
    """Pick the NCCL/RCCL shared library that should be loaded.

    An explicit `VLLM_NCCL_SO_PATH` always wins. Without it, the library
    name is derived from the backend torch was built for: `libnccl.so.2`
    for CUDA builds and `librccl.so.1` for ROCm (HIP) builds.

    Returns:
        The value of `VLLM_NCCL_SO_PATH`, or the default library file name
        for the detected backend.

    Raises:
        ValueError: If torch reports neither a CUDA nor a HIP version.
    """
    env_override = envs.VLLM_NCCL_SO_PATH
    if env_override:
        # The user pointed us at a specific library; use it as given.
        logger.info(
            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", env_override
        )
        return env_override

    # No override: fall back to the default name for the torch backend.
    if torch.version.cuda is not None:
        lib_name = "libnccl.so.2"
    elif torch.version.hip is not None:
        lib_name = "librccl.so.1"
    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")
    logger.debug_once("Found nccl from library %s", lib_name)
    return lib_name
36+
37+
38+
def find_nccl_include_paths() -> list[str] | None:
    """Return candidate include directories that contain `nccl.h`.

    Two sources are consulted, in this order:

    1. The directory named by `VLLM_NCCL_INCLUDE_PATH`, when it is set and
       refers to an existing directory.
    2. The `include` directory under each search location of the
       `nvidia.nccl` package (shipped by `nvidia-nccl-cuXX`), when that
       directory actually holds an `nccl.h`.

    Returns:
        The de-duplicated directories in discovery order, or `None` when
        no directory was found.
    """
    # A plain `import importlib` does not guarantee that the `util`
    # submodule is loaded. Import it explicitly so the lookup below cannot
    # fail with an AttributeError that the best-effort handler would hide.
    import importlib.util

    paths: list[str] = []
    inc = envs.VLLM_NCCL_INCLUDE_PATH
    if inc and os.path.isdir(inc):
        paths.append(inc)

    try:
        spec = importlib.util.find_spec("nvidia.nccl")
        if spec and getattr(spec, "submodule_search_locations", None):
            for loc in spec.submodule_search_locations:
                inc_dir = os.path.join(loc, "include")
                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
                    paths.append(inc_dir)
    except Exception as e:
        # Deliberately broad: resolving the spec imports the parent
        # `nvidia` package, which can fail in arbitrary ways. Header
        # discovery is best effort, so log the reason and carry on.
        logger.debug(
            "Failed to find nccl include path from nvidia.nccl package: %s", e
        )

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(paths)) or None

0 commit comments

Comments
 (0)