Skip to content

Commit 2695e23

Browse files
committed
[OpenMP][CUDA] Fix the issue that P2P memcpy doesn't work
This patch fixes the issue that P2P memcpy doesn't work. The root cause is that we didn't set the current context when calling the API function. In addition, a matrix that tracks the peer-access state of each pair of devices is added, so that each pair only needs to be queried and configured once. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D122764
1 parent fd26d86 commit 2695e23

File tree

1 file changed

+67
-29
lines changed
  • openmp/libomptarget/plugins/cuda/src

1 file changed

+67
-29
lines changed

openmp/libomptarget/plugins/cuda/src/rtl.cpp

Lines changed: 67 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,10 @@ class DeviceRTLTy {
355355
/// devices.
356356
std::vector<bool> InitializedFlags;
357357

358+
/// Cached outcome of the peer-access setup for one ordered
/// (source, destination) device pair. Every pair starts out as `Unkown`;
/// dataExchange moves it to `Yes` once cuCtxEnablePeerAccess has succeeded,
/// or to `No` when the query or the enabling step fails, so that the CUDA
/// query/configuration is attempted only once per pair.
/// NOTE(review): `Unkown` is a misspelling of `Unknown`; renaming it requires
/// updating every use of the enumerator.
enum class PeerAccessState : uint8_t { Unkown, Yes, No };
359+
/// Square table indexed as PeerAccessMatrix[SrcDevId][DstDevId] that caches
/// the peer-access state from the source device to the destination device.
/// It is sized to NumberOfDevices x NumberOfDevices during construction.
std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
360+
/// Mutex that dataExchange holds while it reads or updates PeerAccessMatrix.
std::mutex PeerAccessMatrixLock;
361+
358362
/// A class responsible for interacting with device native runtime library to
359363
/// allocate and free memory.
360364
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
@@ -520,6 +524,9 @@ class DeviceRTLTy {
520524
Modules.resize(NumberOfDevices);
521525
StreamPool.resize(NumberOfDevices);
522526
EventPool.resize(NumberOfDevices);
527+
PeerAccessMatrix.resize(NumberOfDevices);
528+
for (auto &V : PeerAccessMatrix)
529+
V.resize(NumberOfDevices, PeerAccessState::Unkown);
523530

524531
// Get environment variables regarding teams
525532
if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
@@ -1015,48 +1022,77 @@ class DeviceRTLTy {
10151022
}
10161023

10171024
int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1018-
int64_t Size, __tgt_async_info *AsyncInfo) const {
1025+
int64_t Size, __tgt_async_info *AsyncInfo) {
10191026
assert(AsyncInfo && "AsyncInfo is nullptr");
10201027

10211028
CUresult Err;
10221029
CUstream Stream = getStream(SrcDevId, AsyncInfo);
10231030

10241031
// If they are two devices, we try peer to peer copy first
10251032
if (SrcDevId != DstDevId) {
1026-
int CanAccessPeer = 0;
1027-
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1028-
if (Err != CUDA_SUCCESS) {
1029-
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1030-
", dst = %" PRId32 "\n",
1033+
std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
1034+
1035+
switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
1036+
case PeerAccessState::No: {
1037+
REPORT("Peer access from %" PRId32 " to %" PRId32
1038+
" is not supported. Fall back to D2D memcpy.\n",
10311039
SrcDevId, DstDevId);
1032-
CUDA_ERR_STRING(Err);
10331040
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
10341041
}
1042+
case PeerAccessState::Unkown: {
1043+
int CanAccessPeer = 0;
1044+
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1045+
if (Err != CUDA_SUCCESS) {
1046+
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1047+
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1048+
SrcDevId, DstDevId);
1049+
CUDA_ERR_STRING(Err);
1050+
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1051+
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1052+
}
10351053

1036-
if (!CanAccessPeer) {
1037-
DP("P2P memcpy not supported so fall back to D2D memcpy");
1038-
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1039-
}
1054+
if (!CanAccessPeer) {
1055+
REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
1056+
"memcpy.\n",
1057+
SrcDevId, DstDevId);
1058+
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1059+
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1060+
}
10401061

1041-
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1042-
if (Err != CUDA_SUCCESS) {
1043-
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1044-
", dst = %" PRId32 "\n",
1045-
SrcDevId, DstDevId);
1062+
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1063+
if (Err != CUDA_SUCCESS) {
1064+
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1065+
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1066+
SrcDevId, DstDevId);
1067+
CUDA_ERR_STRING(Err);
1068+
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1069+
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1070+
}
1071+
1072+
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
1073+
1074+
LLVM_FALLTHROUGH;
1075+
}
1076+
case PeerAccessState::Yes: {
1077+
Err = cuMemcpyPeerAsync(
1078+
(CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1079+
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
1080+
if (Err == CUDA_SUCCESS)
1081+
return OFFLOAD_SUCCESS;
1082+
1083+
DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1084+
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
1085+
". Fall back to D2D memcpy.\n",
1086+
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
10461087
CUDA_ERR_STRING(Err);
1088+
10471089
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
10481090
}
1049-
1050-
Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1051-
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1052-
Size, Stream);
1053-
if (Err == CUDA_SUCCESS)
1054-
return OFFLOAD_SUCCESS;
1055-
1056-
DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1057-
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1058-
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1059-
CUDA_ERR_STRING(Err);
1091+
default:
1092+
REPORT("Unknown PeerAccessState %d.\n",
1093+
int(PeerAccessMatrix[SrcDevId][DstDevId]));
1094+
return OFFLOAD_FAIL;
1095+
}
10601096
}
10611097

10621098
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
@@ -1598,8 +1634,10 @@ int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
15981634
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
15991635
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
16001636
assert(AsyncInfo && "AsyncInfo is nullptr");
1601-
// NOTE: We don't need to set context for data exchange as the device contexts
1602-
// are passed to CUDA function directly.
1637+
1638+
if (DeviceRTL.setContext(src_dev_id) != OFFLOAD_SUCCESS)
1639+
return OFFLOAD_FAIL;
1640+
16031641
return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
16041642
AsyncInfo);
16051643
}

0 commit comments

Comments
 (0)