13 changes: 7 additions & 6 deletions vllm/distributed/device_communicators/cuda_communicator.py
@@ -14,6 +14,7 @@
 from vllm.distributed.device_communicators.pynccl_allocator import (
     is_symmetric_memory_enabled,
 )
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.platforms import current_platform

@@ -96,35 +97,35 @@ def __init__(
                 from .all2all import NaiveAll2AllManager
 
                 self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
-                logger.info("Using naive all2all manager.")
             elif all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager
 
                 self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
-                logger.info("Using AllGather-ReduceScatter all2all manager.")
             elif all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager
 
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
-                logger.info("Using PPLX all2all manager.")
             elif all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
 
                 self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP High-Throughput all2all manager.")
             elif all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager
 
                 self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP Low-Latency all2all manager.")
             elif all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager
 
                 self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
-                logger.info("Using Flashinfer all2allv manager.")
             else:
                 raise ValueError(f"Unknown all2all backend: {all2all_backend}")
 
+            if is_global_first_rank():
+                logger.info(
+                    "Using %s all2all manager.",
+                    self.all2all_manager.__class__.__name__,
+                )
+
     def all_reduce(self, input_):
         # since currently we perform copy input -> symm_input -> out-of-place AR
         # return symm_output, we don't need to check if input is symmetric
3 changes: 1 addition & 2 deletions vllm/distributed/device_communicators/pynccl.py
@@ -106,11 +106,10 @@ def __init__(
         self.disabled = False
 
         self.nccl_version = self.nccl.ncclGetRawVersion()
-        logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
-
         if self.rank == 0:
             # get the unique id from NCCL
             self.unique_id = self.nccl.ncclGetUniqueId()
+            logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
         else:
             # construct an empty unique id
             self.unique_id = ncclUniqueId()
2 changes: 1 addition & 1 deletion vllm/utils/__init__.py
@@ -1143,7 +1143,7 @@ def find_nccl_library() -> str:
         so_file = "librccl.so.1"
     else:
         raise ValueError("NCCL only supports CUDA and ROCm backends.")
-    logger.info("Found nccl from library %s", so_file)
+    logger.debug_once("Found nccl from library %s", so_file)
     return so_file


2 changes: 1 addition & 1 deletion vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -139,7 +139,7 @@ def __init__(
         # FORCE_NUM_KV_SPLITS=1
         force_num_kv_splits = os.environ.get("FORCE_NUM_KV_SPLITS", None)
         if force_num_kv_splits:
-            logger.warning_once("Forcing num_kv_splits to %d", int(force_num_kv_splits))
+            logger.debug_once("Forcing num_kv_splits to %d", int(force_num_kv_splits))
             self._num_kv_splits = int(force_num_kv_splits)
         else:
             self._num_kv_splits = -1  # => Auto-detect
12 changes: 7 additions & 5 deletions vllm/v1/engine/core.py
@@ -19,6 +19,7 @@

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
@@ -91,11 +92,12 @@ def __init__(
         load_general_plugins()
 
         self.vllm_config = vllm_config
-        logger.info(
-            "Initializing a V1 LLM engine (v%s) with config: %s",
-            VLLM_VERSION,
-            vllm_config,
-        )
+        if is_global_first_rank():
+            logger.info(
+                "Initializing a V1 LLM engine (v%s) with config: %s",
+                VLLM_VERSION,
+                vllm_config,
+            )
 
         self.log_stats = log_stats
 
1 change: 0 additions & 1 deletion vllm/v1/worker/gpu_model_runner.py
@@ -2888,7 +2888,6 @@ def load_model(self, eep_scale_up: bool = False) -> None:
        with DeviceMemoryProfiler() as m:
             time_before_load = time.perf_counter()
             model_loader = get_model_loader(self.load_config)
-            logger.info("Loading model from scratch...")
             self.model = model_loader.load_model(
                 vllm_config=self.vllm_config, model_config=self.model_config
             )
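The diffs above all use the same quieting pattern: emit the one-time configuration banner only from the global first rank, and route repeated per-rank messages through debug_once. Below is a minimal sketch of that pattern using the is_global_first_rank() helper and init_logger factory that appear in the diffs; the announce_backend function and its arguments are hypothetical, for illustration only.

from vllm.distributed.parallel_state import is_global_first_rank
from vllm.logger import init_logger

logger = init_logger(__name__)


def announce_backend(backend_name: str, detail: str) -> None:
    # Print the banner once per job instead of once per rank.
    if is_global_first_rank():
        logger.info("Using %s backend.", backend_name)
    # Per-rank details that would otherwise repeat go through debug_once,
    # which is expected to suppress duplicate emissions of the same message.
    logger.debug_once("Backend detail: %s", detail)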