diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 206afef2ae0da..5787515f57cdf 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -470,7 +470,7 @@ void ProcessGroupNCCL::WorkNCCL::handleException( LOG(ERROR) << exceptionMsg; C10_LOG_API_USAGE_ONCE("ProcessGroupNCCL.WorkNCCL.handleException"); - if (errorHandling == TearDown) { + if (SHOULD_TEAR_DOWN(errorHandling)) { auto tearDownMsg = c10::str( "To avoid data inconsistency, we are taking the entire process down."); LOG(ERROR) << tearDownMsg; @@ -884,11 +884,13 @@ void ProcessGroupNCCL::workCleanupLoop() { // If work hits an exception (either an error or timeout) if (work.exception()) { - // Abort work and corresponding communicators - work.abort(); - // PG level abort, which would abort all other communicators on this - // rank - abort(); + if (SHOULD_CLEAN_UP(asyncErrorHandling_)) { + // Abort work and corresponding communicators + work.abort(); + // PG level abort, which would abort all other communicators on this + // rank + abort(); + } // Report desync state in case of timeout if (desyncDebug_ && timedOut) { try { diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 0dbcecc242885..c103a8693b975 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -44,9 +44,17 @@ constexpr const char* NCCL_DESYNC_DEBUG = "NCCL_DESYNC_DEBUG"; constexpr const char* NCCL_BACKEND_NAME = "nccl"; -// TearDown mode: tear down process upon error, see `WorkNCCL::handleException` -// Soft mode: just clean up collectives and abort communicators without tearing down process -enum ErrorHandlingMode { NoHandling = 0, TearDown = 1, CleanUpOnly = 2 }; +// NoHandling: do not handle asynchronous NCCL errors +// TearDown: tear down process upon error, see `WorkNCCL::handleException` +// CleanUpOnly: just clean up collectives and abort communicators without tearing down process +// SkipCleanUp: (this is a temporary option and can be removed in future) tear +// down process without cleaning up NCCL communicators. This should be used as a +// last resort in case `ncclCommAbort` itself is hanging +enum ErrorHandlingMode { NoHandling = 0, TearDown = 1, CleanUpOnly = 2, SkipCleanUp = 3 }; + +#define SHOULD_CLEAN_UP(a) (a != NoHandling && a != SkipCleanUp) + +#define SHOULD_TEAR_DOWN(a) (a != NoHandling && a != CleanUpOnly) // If set, ProcessGroupNCCL doesn't use recordStream calls to ensure // caching allocator safety for tensors used on both user-facing and