1 file changed: +18 -8 lines changed
Original file line number | Diff line number | Diff line change
@@ -388,16 +388,26 @@ def _verify_args(self) -> None:
388
388
if self .pipeline_parallel_size > 1 :
389
389
raise NotImplementedError (
390
390
"Pipeline parallelism is not supported yet." )
391
- if is_hip ():
391
+ if not self .disable_custom_all_reduce and self .world_size > 1 :
392
+ if is_hip ():
393
+ self .disable_custom_all_reduce = True
394
+ logger .info (
395
+ "Disabled the custom all-reduce kernel because it is not "
396
+ "supported on AMD GPUs." )
397
+ elif self .pipeline_parallel_size > 1 :
398
+ self .disable_custom_all_reduce = True
399
+ logger .info (
400
+ "Disabled the custom all-reduce kernel because it is not "
401
+ "supported with pipeline parallelism." )
402
+
403
+ # FIXME(woosuk): Fix the stability issues and re-enable the custom
404
+ # all-reduce kernel.
405
+ if not self .disable_custom_all_reduce and self .world_size > 1 :
392
406
self .disable_custom_all_reduce = True
393
407
logger .info (
394
- "Disabled the custom all-reduce kernel because it is not "
395
- "supported on AMD GPUs." )
396
- elif self .pipeline_parallel_size > 1 :
397
- self .disable_custom_all_reduce = True
398
- logger .info (
399
- "Disabled the custom all-reduce kernel because it is not "
400
- "supported with pipeline parallelism." )
408
+ "Custom all-reduce kernels are temporarily disabled due to "
409
+ "stability issues. We will re-enable them once the issues are "
410
+ "resolved." )
401
411
402
412
403
413
class SchedulerConfig :
You can’t perform that action at this time.
0 commit comments