@@ -130,45 +130,6 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130130 " Error in hsa_amd_agent_iterate_memory_pools: %s" );
131131}
132132
133- // / Dispatches an asynchronous memory copy.
134- // / Enables different SDMA engines for the dispatch in a round-robin fashion.
135- Error asyncMemCopy (bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136- const void *Src, hsa_agent_t SrcAgent, size_t Size,
137- uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138- hsa_signal_t CompletionSignal) {
139- if (UseMultipleSdmaEngines) {
140- hsa_status_t S =
141- hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, Size,
142- NumDepSignals, DepSignals, CompletionSignal);
143- return Plugin::check (S, " Error in hsa_amd_memory_async_copy: %s" );
144- }
145-
146- // This solution is probably not the best
147- #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148- HSA_AMD_INTERFACE_VERSION_MINOR >= 2 )
149- return Plugin::error (" Async copy on selected SDMA requires ROCm 5.7" );
150- #else
151- static std::atomic<int > SdmaEngine{1 };
152-
153- // This atomics solution is probably not the best, but should be sufficient
154- // for now.
155- // In a worst case scenario, in which threads read the same value, they will
156- // dispatch to the same SDMA engine. This may result in sub-optimal
157- // performance. However, I think the possibility to be fairly low.
158- int LocalSdmaEngine = SdmaEngine.load (std::memory_order_acquire);
159- // This call is only avail in ROCm >= 5.7
160- hsa_status_t S = hsa_amd_memory_async_copy_on_engine (
161- Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
162- CompletionSignal, (hsa_amd_sdma_engine_id_t )LocalSdmaEngine,
163- /* force_copy_on_sdma=*/ true );
164- // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
165- LocalSdmaEngine = (LocalSdmaEngine << 1 ) % 7 ;
166- SdmaEngine.store (LocalSdmaEngine, std::memory_order_relaxed);
167-
168- return Plugin::check (S, " Error in hsa_amd_memory_async_copy_on_engine: %s" );
169- #endif
170- }
171-
172133} // namespace utils
173134
174135// / Utility class representing generic resource references to AMDGPU resources.
@@ -984,9 +945,6 @@ struct AMDGPUStreamTy {
984945 // / Timeout hint for HSA actively waiting for signal value to change
985946 const uint64_t StreamBusyWaitMicroseconds;
986947
987- // / Indicate to spread data transfers across all avilable SDMAs
988- bool UseMultipleSdmaEngines;
989-
990948 // / Return the current number of asychronous operations on the stream.
991949 uint32_t size () const { return NextSlot; }
992950
@@ -1212,15 +1170,15 @@ struct AMDGPUStreamTy {
12121170 InputSignal = nullptr ;
12131171
12141172 // Issue the async memory copy.
1173+ hsa_status_t Status;
12151174 if (InputSignal) {
12161175 hsa_signal_t InputSignalRaw = InputSignal->get ();
1217- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1218- CopySize, 1 , &InputSignalRaw,
1219- OutputSignal->get ());
1220- }
1221-
1222- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1223- CopySize, 0 , nullptr , OutputSignal->get ());
1176+ Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 1 ,
1177+ &InputSignalRaw, OutputSignal->get ());
1178+ } else
1179+ Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 0 ,
1180+ nullptr , OutputSignal->get ());
1181+ return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
12241182 }
12251183
12261184 // / Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1256,19 +1214,21 @@ struct AMDGPUStreamTy {
12561214
12571215 // Issue the first step: device to host transfer. Avoid defining the input
12581216 // dependency if already satisfied.
1217+ hsa_status_t Status;
12591218 if (InputSignal) {
12601219 hsa_signal_t InputSignalRaw = InputSignal->get ();
1261- if (auto Err = utils::asyncMemCopy (
1262- UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
1263- &InputSignalRaw, OutputSignals[0 ]->get ()))
1264- return Err;
1220+ Status =
1221+ hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 1 ,
1222+ &InputSignalRaw, OutputSignals[0 ]->get ());
12651223 } else {
1266- if (auto Err = utils::asyncMemCopy (UseMultipleSdmaEngines, Inter, Agent,
1267- Src, Agent, CopySize, 0 , nullptr ,
1268- OutputSignals[0 ]->get ()))
1269- return Err;
1224+ Status = hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 0 ,
1225+ nullptr , OutputSignals[0 ]->get ());
12701226 }
12711227
1228+ if (auto Err =
1229+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
1230+ return Err;
1231+
12721232 // Consume another stream slot and compute dependencies.
12731233 std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
12741234 assert (InputSignal && " Invalid input signal" );
@@ -1282,7 +1242,7 @@ struct AMDGPUStreamTy {
12821242 std::atomic_thread_fence (std::memory_order_release);
12831243
12841244 // Issue the second step: host to host transfer.
1285- hsa_status_t Status = hsa_amd_signal_async_handler (
1245+ Status = hsa_amd_signal_async_handler (
12861246 InputSignal->get (), HSA_SIGNAL_CONDITION_EQ, 0 , asyncActionCallback,
12871247 (void *)&Slots[Curr]);
12881248
@@ -1358,14 +1318,16 @@ struct AMDGPUStreamTy {
13581318
13591319 // Issue the second step: host to device transfer. Avoid defining the input
13601320 // dependency if already satisfied.
1321+ hsa_status_t Status;
13611322 if (InputSignal && InputSignal->load ()) {
13621323 hsa_signal_t InputSignalRaw = InputSignal->get ();
1363- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter,
1364- Agent, CopySize, 1 , &InputSignalRaw,
1365- OutputSignal->get ());
1366- }
1367- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1368- CopySize, 0 , nullptr , OutputSignal->get ());
1324+ Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 1 ,
1325+ &InputSignalRaw, OutputSignal->get ());
1326+ } else
1327+ Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 0 ,
1328+ nullptr , OutputSignal->get ());
1329+
1330+ return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
13691331 }
13701332
13711333 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1391,15 +1353,17 @@ struct AMDGPUStreamTy {
13911353 // allocated by this runtime or the caller made the appropriate
13921354 // access calls.
13931355
1356+ hsa_status_t Status;
13941357 if (InputSignal && InputSignal->load ()) {
13951358 hsa_signal_t InputSignalRaw = InputSignal->get ();
1396- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1397- SrcAgent, CopySize, 1 , &InputSignalRaw,
1398- OutputSignal->get ());
1399- }
1400- return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1401- SrcAgent, CopySize, 0 , nullptr ,
1402- OutputSignal->get ());
1359+ Status =
1360+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1361+ &InputSignalRaw, OutputSignal->get ());
1362+ } else
1363+ Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1364+ 0 , nullptr , OutputSignal->get ());
1365+
1366+ return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
14031367 }
14041368
14051369 // / Synchronize with the stream. The current thread waits until all operations
@@ -1824,8 +1788,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
18241788 OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
18251789 64 ),
18261790 OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1827- OMPX_UseMultipleSdmaEngines (
1828- " LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false ),
18291791 AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
18301792 AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
18311793
@@ -2244,9 +2206,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22442206 if (auto Err = Signal.init ())
22452207 return Err;
22462208
2247- if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), TgtPtr,
2248- Agent, PinnedPtr, Agent, Size, 0 ,
2249- nullptr , Signal.get ()))
2209+ Status = hsa_amd_memory_async_copy (TgtPtr, Agent, PinnedPtr, Agent, Size,
2210+ 0 , nullptr , Signal.get ());
2211+ if (auto Err =
2212+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
22502213 return Err;
22512214
22522215 if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2304,9 +2267,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
23042267 if (auto Err = Signal.init ())
23052268 return Err;
23062269
2307- if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), PinnedPtr,
2308- Agent, TgtPtr, Agent, Size, 0 , nullptr ,
2309- Signal.get ()))
2270+ Status = hsa_amd_memory_async_copy (PinnedPtr, Agent, TgtPtr, Agent, Size,
2271+ 0 , nullptr , Signal.get ());
2272+ if (auto Err =
2273+ Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
23102274 return Err;
23112275
23122276 if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2669,8 +2633,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26692633 });
26702634 }
26712635
2672- bool useMultipleSdmaEngines () const { return OMPX_UseMultipleSdmaEngines; }
2673-
26742636private:
26752637 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
26762638 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2740,9 +2702,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27402702 // / are microseconds.
27412703 UInt32Envar OMPX_StreamBusyWait;
27422704
2743- // / Use ROCm 5.7 interface for multiple SDMA engines
2744- BoolEnvar OMPX_UseMultipleSdmaEngines;
2745-
27462705 // / Stream manager for AMDGPU streams.
27472706 AMDGPUStreamManagerTy AMDGPUStreamManager;
27482707
@@ -2844,8 +2803,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
28442803 SignalManager (Device.getSignalManager()), Device(Device),
28452804 // Initialize the std::deque with some empty positions.
28462805 Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
2847- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2848- UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
2806+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
28492807
28502808// / Class implementing the AMDGPU-specific functionalities of the global
28512809// / handler.
0 commit comments