Commit 5c22a0a

tanquermeta-codesync[bot] authored and committed
Remove completed_queue for NCCL, MCCL & HCCL
Summary: Similar to D85455174, remove completed_queue for the other CCLs.

Context: The original purpose of completed_queue was to prevent the work object from being destroyed outside the main thread. However, now that we have a finalize method, we can rely on the contract that the user holds onto the work object until finalize is called.

Reviewed By: siyengar

Differential Revision: D85771546

fbshipit-source-id: f28e89b8c1fe769f6a144f9684ac4b39f4a44460
1 parent 477b433 commit 5c22a0a
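
The sketch below is not part of the commit; it is a minimal, hypothetical illustration of the ownership contract the summary relies on. SketchWork, SketchWorkQueue, and SketchStatus are simplified stand-ins for TorchWorkNCCL / TorchWorkNCCLQueue, not the real TorchComms API. The point it shows: because the caller keeps its shared_ptr to each work object until finalize() runs on the main thread, a watchdog thread can drop the queue's reference to completed work without ever destroying the object itself, so the old completed_work_queue_ parking area becomes unnecessary.

// Hypothetical sketch (not the actual TorchComms code): illustrates why a
// separate completed-work queue is no longer needed once callers promise to
// keep their work objects alive until finalize().
#include <cassert>
#include <memory>
#include <mutex>
#include <queue>

enum class SketchStatus { COMPLETED, PENDING };

struct SketchWork {
  // The real backend would poll a CUDA event / NCCL error state here.
  SketchStatus checkStatus() const { return SketchStatus::COMPLETED; }
};

class SketchWorkQueue {
 public:
  void enqueueWork(std::shared_ptr<SketchWork> work) {
    std::lock_guard<std::mutex> lock(mutex_);
    pending_.push(std::move(work));
  }

  // Callable from a watchdog thread. Popping a completed entry only releases
  // the queue's reference; the caller still holds its own shared_ptr, so the
  // object is never destroyed on this thread under the stated contract.
  SketchStatus garbageCollect() {
    std::lock_guard<std::mutex> lock(mutex_);
    while (!pending_.empty() &&
           pending_.front()->checkStatus() == SketchStatus::COMPLETED) {
      pending_.pop();
    }
    return pending_.empty() ? SketchStatus::COMPLETED : SketchStatus::PENDING;
  }

  // Main-thread only: drain the queue so it returns holding no references to
  // any work object.
  SketchStatus finalize() {
    std::lock_guard<std::mutex> lock(mutex_);
    SketchStatus status = SketchStatus::COMPLETED;
    while (!pending_.empty()) {
      status = pending_.front()->checkStatus();
      pending_.pop();
    }
    return status;
  }

 private:
  std::queue<std::shared_ptr<SketchWork>> pending_;
  std::mutex mutex_;
};

int main() {
  SketchWorkQueue queue;
  auto work = std::make_shared<SketchWork>();  // caller keeps this reference
  queue.enqueueWork(work);
  queue.garbageCollect();  // e.g. from the timeout watchdog thread
  queue.finalize();        // main thread; only now may `work` be released
  assert(work.use_count() == 1);  // the queue no longer holds a reference
  return 0;
}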

File tree

4 files changed: +8, -17 lines

comms/torchcomms/nccl/TorchCommNCCL.hpp

Lines changed: 1 addition & 1 deletion
@@ -299,7 +299,7 @@ class TorchCommNCCL : public TorchCommBackend,
   void timeoutWatchdog() noexcept;
   void checkInitialized() const;
   void checkAndAbortIfTimedOutOrError();
-  void checkWorkQueue(bool isMainThread);
+  void checkWorkQueue();
   void enqueueWork(std::shared_ptr<TorchWorkNCCL> work, cudaStream_t stream);
   bool getGraphCaptureMode();
   cudaStream_t getOperationStream(bool async_op);

comms/torchcomms/nccl/TorchCommNCCLUtils.cpp

Lines changed: 4 additions & 4 deletions
@@ -175,8 +175,8 @@ TorchCommNCCL::RedOpRAII TorchCommNCCL::getNcclReduceOp(
   }
 }

-void TorchCommNCCL::checkWorkQueue(bool isMainThread) {
-  TorchWorkNCCL::WorkStatus status = workq_.garbageCollect(isMainThread);
+void TorchCommNCCL::checkWorkQueue() {
+  TorchWorkNCCL::WorkStatus status = workq_.garbageCollect();

   switch (status) {
     case TorchWorkNCCL::WorkStatus::TIMEDOUT:
@@ -210,7 +210,7 @@ void TorchCommNCCL::timeoutWatchdog() noexcept {
     }

     // Check work objects for completion or timeout
-    checkWorkQueue(false);
+    checkWorkQueue();
     if (comm_state_ != CommState::NORMAL &&
         options_.abort_process_on_timeout_or_error) {
       // Log the error and abort the process. We cannot abort the NCCL
@@ -243,7 +243,7 @@ void TorchCommNCCL::checkAndAbortIfTimedOutOrError() {
   }

   // First, check work queue status
-  checkWorkQueue(true);
+  checkWorkQueue();

   if (comm_state_ == CommState::TIMEOUT) {
     abortNcclComm();

comms/torchcomms/nccl/TorchWorkNCCL.hpp

Lines changed: 1 addition & 2 deletions
@@ -91,15 +91,14 @@ class TorchWorkNCCLQueue {
   TorchWorkNCCLQueue() = default;
   ~TorchWorkNCCLQueue() = default;

-  TorchWorkNCCL::WorkStatus garbageCollect(bool isMainThread);
+  TorchWorkNCCL::WorkStatus garbageCollect();
   // Finalize function can only be called from the main thread
   TorchWorkNCCL::WorkStatus finalize();
   void enqueueWork(std::shared_ptr<TorchWorkNCCL> work, cudaStream_t stream);

  private:
   std::unordered_map<cudaStream_t, std::queue<std::shared_ptr<TorchWorkNCCL>>>
       stream_work_queues_;
-  std::vector<std::shared_ptr<TorchWorkNCCL>> completed_work_queue_;
   std::recursive_mutex work_queues_mutex_;
 };

comms/torchcomms/nccl/TorchWorkNCCLQueue.cpp

Lines changed: 2 additions & 10 deletions
@@ -5,8 +5,7 @@
 namespace torch {
 namespace comms {

-TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::garbageCollect(
-    bool isMainThread) {
+TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::garbageCollect() {
   std::lock_guard<std::recursive_mutex> lock(work_queues_mutex_);

   TorchWorkNCCL::WorkStatus last_status = TorchWorkNCCL::WorkStatus::COMPLETED;
@@ -29,7 +28,6 @@ TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::garbageCollect(
       if (status == TorchWorkNCCL::WorkStatus::COMPLETED) {
         // Work is completed, remove it from the work queue
         work_queue.pop();
-        completed_work_queue_.push_back(work);
         // Continue to the next element in the queue
       } else if (
           status == TorchWorkNCCL::WorkStatus::TIMEDOUT ||
@@ -50,11 +48,6 @@ TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::garbageCollect(
     }
   }

-  if (isMainThread) {
-    // If we are the main thread, clear the completed work queues
-    completed_work_queue_.clear();
-  }
-
   return last_status;
 }

@@ -70,7 +63,7 @@ TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::finalize() {
   // empty
   TorchWorkNCCL::WorkStatus status = TorchWorkNCCL::WorkStatus::COMPLETED;
   while (!stream_work_queues_.empty()) {
-    status = garbageCollect(true);
+    status = garbageCollect();
     if (status == TorchWorkNCCL::WorkStatus::ERROR ||
         status == TorchWorkNCCL::WorkStatus::TIMEDOUT ||
        status == TorchWorkNCCL::WorkStatus::COMPLETED) {
@@ -83,7 +76,6 @@ TorchWorkNCCL::WorkStatus TorchWorkNCCLQueue::finalize() {
   // NOTE: finalize MUST return without holding references to any work object,
   // otherwise it may leak object and cause side effects.
   stream_work_queues_.clear();
-  completed_work_queue_.clear();

   return status;
 }
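
As a reading aid, here is a rough, hypothetical reconstruction of the shape garbageCollect() takes after this commit, with the completed_work_queue_ hand-off gone. StreamHandle, Work, and WorkQueueSketch are simplified stand-ins (the real code uses cudaStream_t and TorchWorkNCCL, and may differ beyond what the hunks above show).

// Hypothetical, simplified reconstruction of the post-change garbage
// collection loop; stand-in types, not the real TorchComms/CUDA types.
#include <memory>
#include <mutex>
#include <queue>
#include <unordered_map>

enum class Status { COMPLETED, TIMEDOUT, ERROR, PENDING };

using StreamHandle = int;  // stand-in for cudaStream_t

struct Work {
  Status state = Status::COMPLETED;
  Status checkStatus() const { return state; }
};

class WorkQueueSketch {
 public:
  void enqueueWork(std::shared_ptr<Work> work, StreamHandle stream) {
    std::lock_guard<std::recursive_mutex> lock(mutex_);
    queues_[stream].push(std::move(work));
  }

  // Walk every per-stream queue, dropping completed work and stopping each
  // stream at its first unfinished entry. Completed entries are simply
  // popped: there is no completed_work_queue_ to park them in anymore.
  Status garbageCollect() {
    std::lock_guard<std::recursive_mutex> lock(mutex_);
    Status last_status = Status::COMPLETED;

    for (auto it = queues_.begin(); it != queues_.end();) {
      auto& work_queue = it->second;
      while (!work_queue.empty()) {
        Status s = work_queue.front()->checkStatus();
        if (s == Status::COMPLETED) {
          work_queue.pop();  // queue drops its reference; caller keeps its own
        } else if (s == Status::TIMEDOUT || s == Status::ERROR) {
          last_status = s;  // surface the failure to the caller
          break;
        } else {
          break;  // first pending entry blocks the rest of this stream
        }
      }
      if (work_queue.empty()) {
        it = queues_.erase(it);  // forget streams that have fully drained
      } else {
        ++it;
      }
    }
    return last_status;
  }

 private:
  std::unordered_map<StreamHandle, std::queue<std::shared_ptr<Work>>> queues_;
  std::recursive_mutex mutex_;
};

The premise, per the commit summary, is that popping a completed entry never triggers destruction on the watchdog thread, because the caller is contractually still holding its own reference until finalize() has run.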
