Add cuda_device_count_stateless #5473

Merged · 12 commits · Jun 13, 2024 · Changes from 9 commits
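This PR replaces direct torch.cuda.device_count() calls throughout vLLM with a new helper, get_num_gpus_available_isolated(), which queries the device count without depending on state cached in the current process. Below is a minimal sketch of the failure mode being avoided (illustrative, not part of the PR; in the torch versions of this era, device_count() was memoized on first call):

```python
# Illustrative sketch of the problem: once torch.cuda.device_count() has been
# called (or CUDA has been initialized), some torch versions memoize the
# result, so a later change to CUDA_VISIBLE_DEVICES is silently ignored.
import os

import torch

print(torch.cuda.device_count())          # e.g. 8 on an 8-GPU node

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # restrict visibility afterwards
print(torch.cuda.device_count())          # may still report 8, not 1
```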
17 changes: 2 additions & 15 deletions tests/conftest.py
@@ -1,8 +1,6 @@
 import contextlib
 import gc
 import os
-import subprocess
-import sys
 from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import pytest
@@ -21,7 +19,7 @@
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu
+from vllm.utils import get_num_gpus_available_isolated, is_cpu
 
 logger = init_logger(__name__)
 
@@ -537,15 +535,4 @@ def num_gpus_available():
     """Get number of GPUs without initializing the CUDA context
     in current process."""
 
-    try:
-        out = subprocess.run([
-            sys.executable, "-c",
-            "import torch; print(torch.cuda.device_count())"
-        ],
-                             capture_output=True,
-                             check=True,
-                             text=True)
-    except subprocess.CalledProcessError as e:
-        logger.warning("Failed to get number of GPUs.", exc_info=e)
-        return 0
-    return int(out.stdout.strip())
+    return get_num_gpus_available_isolated()
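For context, a hypothetical consumer of this fixture might look like the following (illustrative only; the test name is invented):

```python
# Hypothetical test using the num_gpus_available fixture (not from this PR).
import pytest


def test_multi_gpu_generation(num_gpus_available):
    if num_gpus_available < 2:
        pytest.skip("requires at least 2 GPUs")
    # ... exercise a tensor-parallel code path here ...
```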
6 changes: 3 additions & 3 deletions vllm/config.py
@@ -11,7 +11,8 @@
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu
+from vllm.utils import (get_cpu_memory, get_num_gpus_available_isolated,
+                        is_cpu, is_hip, is_neuron, is_tpu)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -605,12 +606,11 @@ def __init__(
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
-            from torch.cuda import device_count
-
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray is not None
-            if device_count() < self.world_size:
+            if get_num_gpus_available_isolated() < self.world_size:
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
                                      "required for multi-node inference")
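For readability, the backend-selection logic above can be summarized as a standalone function (a sketch with invented names, not code from the PR):

```python
# Sketch of the decision above: default to multiprocessing when the node has
# enough GPUs for world_size; otherwise Ray is required for multi-node runs.
def choose_distributed_backend(world_size: int, num_local_gpus: int,
                               ray_found: bool) -> str:
    if num_local_gpus < world_size:
        if not ray_found:
            raise ValueError("Unable to load Ray which is "
                             "required for multi-node inference")
        return "ray"
    return "mp"
```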
3 changes: 2 additions & 1 deletion vllm/distributed/device_communicators/custom_all_reduce.py
@@ -12,6 +12,7 @@
 from vllm.distributed.parallel_state import (
     get_local_rank, get_tensor_model_parallel_cpu_group, is_in_the_same_node)
 from vllm.logger import init_logger
+from vllm.utils import get_num_gpus_available_isolated
 
 try:
     import pynvml
@@ -149,7 +150,7 @@ def __init__(self,
         if cuda_visible_devices:
            device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(torch.cuda.device_count()))
+            device_ids = list(range(get_num_gpus_available_isolated()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id],
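A small worked example of the logical-to-physical device mapping above (illustrative values):

```python
# With CUDA_VISIBLE_DEVICES="2,3" the process sees two logical devices, and
# device_ids translates a logical torch index back to a physical GPU id.
cuda_visible_devices = "2,3"
device_ids = list(map(int, cuda_visible_devices.split(",")))  # [2, 3]

logical_index = 1                          # i.e. torch.device("cuda:1")
physical_device_id = device_ids[logical_index]
assert physical_device_id == 3             # physical GPU 3 on this node
```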
@@ -13,6 +13,7 @@
 import vllm.envs as envs
 from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
 from vllm.logger import init_logger
+from vllm.utils import get_num_gpus_available_isolated
 
 logger = init_logger(__name__)

@@ -153,7 +154,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = torch.cuda.device_count()
+    num_dev = get_num_gpus_available_isolated()
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
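A one-line worked example of the fallback above (illustrative):

```python
# With 4 visible devices and CUDA_VISIBLE_DEVICES unset, the fallback string
# simply enumerates every device index.
num_dev = 4
assert ",".join(str(i) for i in range(num_dev)) == "0,1,2,3"
```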
6 changes: 3 additions & 3 deletions vllm/executor/multiproc_gpu_executor.py
@@ -9,7 +9,8 @@
                                                   ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (get_distributed_init_method, get_ip,
+                        get_num_gpus_available_isolated, get_open_port,
                         get_vllm_instance_id, make_async)
 
 logger = init_logger(__name__)
@@ -33,8 +34,7 @@ def _init_executor(self) -> None:
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
-        from torch.cuda import device_count
-        assert world_size <= device_count(), (
+        assert world_size <= get_num_gpus_available_isolated(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
         distributed_init_method = get_distributed_init_method(
35 changes: 35 additions & 0 deletions vllm/utils.py
@@ -693,3 +693,38 @@ def inner(*args, **kwargs):
         return inner  # type: ignore
 
     return wrapper
+
+
+@lru_cache(maxsize=5)
Review thread on the @lru_cache(maxsize=5) line:

Member: why maxsize here? maybe just infinite size cache?

Collaborator (author): I don't think letting any cache become unbound is a good practice :)
+def _get_num_gpus_available_isolated(
+        cuda_visible_devices: Optional[str] = None) -> int:
+    # Note: cuda_visible_devices is not used, but we keep it as an argument for
+    # LRU Cache purposes.
+
+    # Code below is based on
+    # https://github.com/pytorch/pytorch/blob/
+    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
+    # torch/cuda/__init__.py#L831C1-L831C17
+    import torch.cuda
+    import torch.version
+
+    if not torch.cuda._is_compiled():
+        return 0
+    # bypass _device_count_nvml() if rocm (not supported)
+    nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
+    return r
+
+
+def get_num_gpus_available_isolated() -> int:
+    """Get number of GPUs without caching the number of devices
+    in current process.
+
+    This should be used instead of torch.cuda.device_count()
+    unless CUDA_VISIBLE_DEVICES has already been set to the desired
+    value."""
+
+    # This can be removed and simply replaced with torch.cuda.get_device_count
+    # after https://github.com/pytorch/pytorch/pull/122815 is released.
+
+    return _get_num_gpus_available_isolated(envs.CUDA_VISIBLE_DEVICES)
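A hypothetical usage sketch (assumes a CUDA-enabled torch build and a node with at least two GPUs): because the public wrapper re-reads CUDA_VISIBLE_DEVICES on every call and passes it as the cache key, changes to the variable between calls are reflected in the result, unlike a memoized torch.cuda.device_count():

```python
# Hypothetical usage (assumes >= 2 physical GPUs; values are illustrative).
import os

from vllm.utils import get_num_gpus_available_isolated

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
assert get_num_gpus_available_isolated() == 2

os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # new value -> new cache key
assert get_num_gpus_available_isolated() == 1
```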