From 3711811b1d2956e83e626c72f0e1607f2dfbc8fb Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 8 Feb 2024 09:58:03 -0800
Subject: [PATCH] Disable custom all reduce by default (#2808)

---
 vllm/config.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index c35b6302b2cfa..27c61d4d50439 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -388,16 +388,26 @@ def _verify_args(self) -> None:
         if self.pipeline_parallel_size > 1:
             raise NotImplementedError(
                 "Pipeline parallelism is not supported yet.")
-        if is_hip():
+        if not self.disable_custom_all_reduce and self.world_size > 1:
+            if is_hip():
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported on AMD GPUs.")
+            elif self.pipeline_parallel_size > 1:
+                self.disable_custom_all_reduce = True
+                logger.info(
+                    "Disabled the custom all-reduce kernel because it is not "
+                    "supported with pipeline parallelism.")
+
+        # FIXME(woosuk): Fix the stability issues and re-enable the custom
+        # all-reduce kernel.
+        if not self.disable_custom_all_reduce and self.world_size > 1:
             self.disable_custom_all_reduce = True
             logger.info(
-                "Disabled the custom all-reduce kernel because it is not "
-                "supported on AMD GPUs.")
-        elif self.pipeline_parallel_size > 1:
-            self.disable_custom_all_reduce = True
-            logger.info(
-                "Disabled the custom all-reduce kernel because it is not "
-                "supported with pipeline parallelism.")
+                "Custom all-reduce kernels are temporarily disabled due to "
+                "stability issues. We will re-enable them once the issues are "
+                "resolved.")
 
 
 class SchedulerConfig: