@@ -355,6 +355,10 @@ class DeviceRTLTy {
355
355
// / devices.
356
356
std::vector<bool > InitializedFlags;
357
357
358
  /// Tri-state peer-to-peer access capability between a (src, dst) device
  /// pair. Starts as Unkown and is resolved lazily by dataExchange() on the
  /// first copy between that pair, then cached.
  /// NOTE(review): "Unkown" is an upstream typo for "Unknown"; kept as-is
  /// because dataExchange() and the constructor refer to this spelling.
  enum class PeerAccessState : uint8_t { Unkown, Yes, No };
  /// PeerAccessMatrix[Src][Dst] caches whether device Src can directly
  /// access device Dst's memory (sized NumberOfDevices x NumberOfDevices).
  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
  /// Guards every read and write of PeerAccessMatrix.
  std::mutex PeerAccessMatrixLock;
358
362
// / A class responsible for interacting with device native runtime library to
359
363
// / allocate and free memory.
360
364
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
@@ -520,6 +524,9 @@ class DeviceRTLTy {
520
524
    Modules.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);
    // Allocate one row per source device; every (src, dst) pair starts in
    // the undetermined state ("Unkown" [sic]) until dataExchange() queries
    // the driver for the pair the first time.
    PeerAccessMatrix.resize(NumberOfDevices);
    for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unkown);
524
531
// Get environment variables regarding teams
525
532
if (const char *EnvStr = getenv (" OMP_TEAM_LIMIT" )) {
@@ -1015,48 +1022,77 @@ class DeviceRTLTy {
1015
1022
}
1016
1023
1017
1024
  /// Copy \p Size bytes from \p SrcPtr on device \p SrcDevId to \p DstPtr on
  /// device \p DstDevId, asynchronously on the stream taken from \p AsyncInfo.
  ///
  /// For distinct devices it first tries CUDA peer-to-peer copy, consulting
  /// (and lazily filling) PeerAccessMatrix so the cuDeviceCanAccessPeer /
  /// cuCtxEnablePeerAccess handshake runs at most once per device pair. Any
  /// failure along the P2P path falls back to a staged D2D memcpy.
  ///
  /// Non-const (unlike before this patch) because it mutates PeerAccessMatrix.
  /// Returns OFFLOAD_SUCCESS / OFFLOAD_FAIL (via memcpyDtoD on fallbacks).
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err;
    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      // NOTE(review): this lock is held for the whole P2P path, including
      // the cuMemcpyPeerAsync launch below, which serializes concurrent
      // inter-device copies — confirm whether narrowing the critical section
      // to the matrix accesses is safe.
      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);

      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
      case PeerAccessState::No: {
        // Pair already known to lack P2P support: go straight to fallback.
        REPORT("Peer access from %" PRId32 " to %" PRId32
               " is not supported. Fall back to D2D memcpy.\n",
               SrcDevId, DstDevId);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      case PeerAccessState::Unkown: {
        // First copy for this pair: ask the driver and cache the answer.
        int CanAccessPeer = 0;
        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        if (!CanAccessPeer) {
          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
                 "memcpy.\n",
                 SrcDevId, DstDevId);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        // Access is possible but must be enabled on the destination context.
        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;

        // Deliberate fall-through: P2P is now enabled, perform the copy.
        LLVM_FALLTHROUGH;
      }
      case PeerAccessState::Yes: {
        Err = cuMemcpyPeerAsync(
            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
        if (Err == CUDA_SUCCESS)
          return OFFLOAD_SUCCESS;

        // Even an enabled P2P copy can fail at launch time; fall back.
        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
           ". Fall back to D2D memcpy.\n",
           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
        CUDA_ERR_STRING(Err);

        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      default:
        // Unreachable unless the enum gains a value without a case above.
        REPORT("Unknown PeerAccessState %d.\n",
               int(PeerAccessMatrix[SrcDevId][DstDevId]));
        return OFFLOAD_FAIL;
      }
    }

    // Same device: plain device-to-device copy.
    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }
/// Plugin entry point: asynchronous inter-device data exchange.
/// Sets the source device's CUDA context (now required, since
/// DeviceRTLTy::dataExchange may call cuDeviceCanAccessPeer /
/// cuCtxEnablePeerAccess, which act on the current context) and forwards to
/// DeviceRTLTy::dataExchange.
/// NOTE(review): the second parameter line is outside the visible hunk; the
/// dst/size parameter spellings below are reconstructed from the forwarded
/// call — confirm against the full file.
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size, __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  // Context must be set here (it used to be unnecessary when only context-
  // taking CUDA APIs were used by the exchange path).
  if (DeviceRTL.setContext(src_dev_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}
0 commit comments