Skip to content

Commit 2b7203a

Browse files
committed
[Libomptarget] Deinitialize AMDGPU global state more intentionally
A previous patch made the destruction of the HSA plugin more deterministic. However, there were still other global values that were not handled this way. When attempting to call a destructor kernel, the device would already have been uninitialized, and we could not find the appropriate kernel to call. This is because the kernels were stored in global containers whose destructors had already been called. This patch merges that global state into the rest of the device info state by moving those global values into the object referenced by the global pointer, which is already allocated and deallocated by the constructor and destructor. This should allow the AMDGPU plugin to correctly identify the destructor kernels if we were to run them.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D131011
1 parent 9cf6511 commit 2b7203a

File tree

1 file changed

+36
-33
lines changed
  • openmp/libomptarget/plugins/amdgpu/src

1 file changed

+36
-33
lines changed

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

+36-33
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,6 @@ struct KernelArgPool {
210210
};
211211
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
212212

213-
std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
214-
KernelArgPoolMap;
215-
216213
/// Use a single entity to encode a kernel and a set of flags
217214
struct KernelTy {
218215
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
@@ -224,7 +221,9 @@ struct KernelTy {
224221
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
225222
int32_t DeviceId, void *CallStackAddr, const char *Name,
226223
uint32_t KernargSegmentSize,
227-
hsa_amd_memory_pool_t &KernArgMemoryPool)
224+
hsa_amd_memory_pool_t &KernArgMemoryPool,
225+
std::unordered_map<std::string, std::unique_ptr<KernelArgPool>>
226+
&KernelArgPoolMap)
228227
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
229228
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
230229
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
@@ -238,10 +237,6 @@ struct KernelTy {
238237
}
239238
};
240239

241-
/// List that contains all the kernels.
242-
/// FIXME: we may need this to be per device and per library.
243-
std::list<KernelTy> KernelsList;
244-
245240
template <typename Callback> static hsa_status_t findAgents(Callback CB) {
246241

247242
hsa_status_t Err =
@@ -456,6 +451,12 @@ class RTLDeviceInfoTy : HSALifetime {
456451

457452
int NumberOfDevices = 0;
458453

454+
/// List that contains all the kernels.
455+
/// FIXME: we may need this to be per device and per library.
456+
std::list<KernelTy> KernelsList;
457+
std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
458+
KernelArgPoolMap;
459+
459460
// GPU devices
460461
std::vector<hsa_agent_t> HSAAgents;
461462
std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
@@ -857,7 +858,6 @@ class RTLDeviceInfoTy : HSALifetime {
857858
"Unexpected device id!");
858859
FuncGblEntries[DeviceId].emplace_back();
859860
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
860-
// KernelArgPoolMap.clear();
861861
E.Entries.clear();
862862
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
863863
}
@@ -1116,19 +1116,6 @@ pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
11161116
static RTLDeviceInfoTy *DeviceInfoState = nullptr;
11171117
static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }
11181118

1119-
int32_t __tgt_rtl_init_plugin() {
1120-
DeviceInfoState = new RTLDeviceInfoTy;
1121-
return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
1122-
? OFFLOAD_SUCCESS
1123-
: OFFLOAD_FAIL;
1124-
}
1125-
1126-
int32_t __tgt_rtl_deinit_plugin() {
1127-
if (DeviceInfoState)
1128-
delete DeviceInfoState;
1129-
return OFFLOAD_SUCCESS;
1130-
}
1131-
11321119
namespace {
11331120

11341121
int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
@@ -1171,7 +1158,7 @@ int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
11711158
(long long unsigned)(Elf64_Addr)HstPtr,
11721159
(long long unsigned)(Elf64_Addr)TgtPtr);
11731160
Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1174-
DeviceId);
1161+
DeviceId);
11751162
if (Err != HSA_STATUS_SUCCESS) {
11761163
DP("Error when copying data from host to device. Pointers: "
11771164
"host = 0x%016lx, device = 0x%016lx, size = %lld\n",
@@ -1468,8 +1455,9 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
14681455
KernelArgPool *ArgPool = nullptr;
14691456
void *KernArg = nullptr;
14701457
{
1471-
auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1472-
if (It != KernelArgPoolMap.end()) {
1458+
auto It =
1459+
DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name));
1460+
if (It != DeviceInfo().KernelArgPoolMap.end()) {
14731461
ArgPool = (It->second).get();
14741462
}
14751463
}
@@ -2031,6 +2019,20 @@ bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
20312019
}
20322020

20332021
extern "C" {
2022+
2023+
int32_t __tgt_rtl_init_plugin() {
2024+
DeviceInfoState = new RTLDeviceInfoTy;
2025+
return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
2026+
? OFFLOAD_SUCCESS
2027+
: OFFLOAD_FAIL;
2028+
}
2029+
2030+
int32_t __tgt_rtl_deinit_plugin() {
2031+
if (DeviceInfoState)
2032+
delete DeviceInfoState;
2033+
return OFFLOAD_SUCCESS;
2034+
}
2035+
20342036
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
20352037
return elfMachineIdIsAmdgcn(Image);
20362038
}
@@ -2373,8 +2375,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
23732375
}
23742376

23752377
// write ptr to device memory so it can be used by later kernels
2376-
Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
2377-
DeviceId);
2378+
Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2379+
sizeof(void *), DeviceId);
23782380
if (Err != HSA_STATUS_SUCCESS) {
23792381
DP("memcpy install of state_ptr failed\n");
23802382
return NULL;
@@ -2437,8 +2439,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
24372439
// If unified memory is present any target link variables
24382440
// can access host addresses directly. There is no longer a
24392441
// need for device copies.
2440-
Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
2441-
sizeof(void *), DeviceId);
2442+
Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
2443+
DeviceId);
24422444
if (Err != HSA_STATUS_SUCCESS)
24432445
DP("Error when copying USM\n");
24442446
DP("Copy linked variable host address (" DPxMOD ")"
@@ -2598,11 +2600,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
25982600
}
25992601
check("Loading computation property", Err);
26002602

2601-
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2602-
CallStackAddr, E->name, KernargSegmentSize,
2603-
DeviceInfo().KernArgPool));
2603+
DeviceInfo().KernelsList.push_back(
2604+
KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name,
2605+
KernargSegmentSize, DeviceInfo().KernArgPool,
2606+
DeviceInfo().KernelArgPoolMap));
26042607
__tgt_offload_entry Entry = *E;
2605-
Entry.addr = (void *)&KernelsList.back();
2608+
Entry.addr = (void *)&DeviceInfo().KernelsList.back();
26062609
DeviceInfo().addOffloadEntry(DeviceId, Entry);
26072610
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
26082611
}

0 commit comments

Comments
 (0)