[OpenMP][CUDA] Fix the issue that P2P memcpy doesn't work

shiltian · shiltian · commit 2695e23ad96b · 2022-06-28T15:32:03.000-04:00
This patch fixes the issue that P2P memcpy doesn't work. The root cause is we didn't set current context when calling the API function. In addition, a matrix to track the states of each pair of devices is also added such that we only need to query and configure the device once. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D122764
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -355,6 +355,10 @@ class DeviceRTLTy {
   /// devices.
   std::vector<bool> InitializedFlags;
 
+  enum class PeerAccessState : uint8_t { Unkown, Yes, No };
+  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
+  std::mutex PeerAccessMatrixLock;
+
   /// A class responsible for interacting with device native runtime library to
   /// allocate and free memory.
   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
@@ -520,6 +524,9 @@ class DeviceRTLTy {
     Modules.resize(NumberOfDevices);
     StreamPool.resize(NumberOfDevices);
     EventPool.resize(NumberOfDevices);
+    PeerAccessMatrix.resize(NumberOfDevices);
+    for (auto &V : PeerAccessMatrix)
+      V.resize(NumberOfDevices, PeerAccessState::Unkown);
 
     // Get environment variables regarding teams
     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
@@ -1015,48 +1022,77 @@ class DeviceRTLTy {
   }
 
   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
-                   int64_t Size, __tgt_async_info *AsyncInfo) const {
+                   int64_t Size, __tgt_async_info *AsyncInfo) {
     assert(AsyncInfo && "AsyncInfo is nullptr");
 
     CUresult Err;
     CUstream Stream = getStream(SrcDevId, AsyncInfo);
 
     // If they are two devices, we try peer to peer copy first
     if (SrcDevId != DstDevId) {
-      int CanAccessPeer = 0;
-      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
-      if (Err != CUDA_SUCCESS) {
-        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
-               ", dst = %" PRId32 "\n",
+      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
+
+      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
+      case PeerAccessState::No: {
+        REPORT("Peer access from %" PRId32 " to %" PRId32
+               " is not supported. Fall back to D2D memcpy.\n",
                SrcDevId, DstDevId);
-        CUDA_ERR_STRING(Err);
         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
       }
+      case PeerAccessState::Unkown: {
+        int CanAccessPeer = 0;
+        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
+        if (Err != CUDA_SUCCESS) {
+          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
+                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
+                 SrcDevId, DstDevId);
+          CUDA_ERR_STRING(Err);
+          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
+          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+        }
 
-      if (!CanAccessPeer) {
-        DP("P2P memcpy not supported so fall back to D2D memcpy");
-        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-      }
+        if (!CanAccessPeer) {
+          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
+                 "memcpy.\n",
+                 SrcDevId, DstDevId);
+          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
+          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+        }
 
-      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
-      if (Err != CUDA_SUCCESS) {
-        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
-               ", dst = %" PRId32 "\n",
-               SrcDevId, DstDevId);
+        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
+        if (Err != CUDA_SUCCESS) {
+          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
+                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
+                 SrcDevId, DstDevId);
+          CUDA_ERR_STRING(Err);
+          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
+          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
+        }
+
+        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
+
+        LLVM_FALLTHROUGH;
+      }
+      case PeerAccessState::Yes: {
+        Err = cuMemcpyPeerAsync(
+            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
+            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
+        if (Err == CUDA_SUCCESS)
+          return OFFLOAD_SUCCESS;
+
+        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
+           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
+           ". Fall back to D2D memcpy.\n",
+           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
         CUDA_ERR_STRING(Err);
+
         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
       }
-
-      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
-                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
-                              Size, Stream);
-      if (Err == CUDA_SUCCESS)
-        return OFFLOAD_SUCCESS;
-
-      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
-         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
-         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
-      CUDA_ERR_STRING(Err);
+      default:
+        REPORT("Unknown PeerAccessState %d.\n",
+               int(PeerAccessMatrix[SrcDevId][DstDevId]));
+        return OFFLOAD_FAIL;
+      }
     }
 
     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
@@ -1598,8 +1634,10 @@ int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
   assert(AsyncInfo && "AsyncInfo is nullptr");
-  // NOTE: We don't need to set context for data exchange as the device contexts
-  // are passed to CUDA function directly.
+
+  if (DeviceRTL.setContext(src_dev_id) != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                 AsyncInfo);
 }