Skip to content

Commit ff6f893

Browse files
committed
[Offload][AMDGPU] Correctly handle variable implicit argument sizes
Summary: The size of the implicit argument struct can vary depending on optimizations, it is not always the size as listed by the full struct. Additionally, the implicit arguments are always aligned on a pointer boundary. This patch updates the handling to use the correctly aligned offset and only initialize the members if they are contained in the reported size. Additionally, we modify the `alloc` and `free` routines to allow `alloc(0)` and `free(nullptr)` as these are mandated by the C standard and allow us to easily handle cases where the user calls a kernel with no arguments.
1 parent ed71a4f commit ff6f893

File tree

2 files changed

+51
-21
lines changed

2 files changed

+51
-21
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
420420
assert(PtrStorage && "Invalid pointer storage");
421421

422422
*PtrStorage = MemoryManager->allocate(Size, nullptr);
423-
if (*PtrStorage == nullptr)
423+
if (Size && *PtrStorage == nullptr)
424424
return Plugin::error(ErrorCode::OUT_OF_RESOURCES,
425425
"failure to allocate from AMDGPU memory manager");
426426

@@ -429,7 +429,8 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
429429

430430
/// Release an allocation to be reused.
431431
Error deallocate(void *Ptr) {
432-
assert(Ptr && "Invalid pointer");
432+
if (!Ptr)
433+
return Plugin::success();
433434

434435
if (MemoryManager->free(Ptr))
435436
return Plugin::error(ErrorCode::UNKNOWN,
@@ -3365,7 +3366,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33653366
KernelLaunchParamsTy LaunchParams,
33663367
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
33673368
if (ArgsSize != LaunchParams.Size &&
3368-
ArgsSize != LaunchParams.Size + getImplicitArgsSize())
3369+
ArgsSize > LaunchParams.Size + getImplicitArgsSize())
33693370
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
33703371
"mismatch of kernel arguments size");
33713372

@@ -3401,23 +3402,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
34013402
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
34023403
return Err;
34033404

3404-
hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr;
3405-
if (ArgsSize == LaunchParams.Size + getImplicitArgsSize()) {
3406-
ImplArgs = reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3407-
utils::advancePtr(AllArgs, LaunchParams.Size));
3408-
3409-
// Set the COV5+ implicit arguments to the appropriate values.
3410-
std::memset(ImplArgs, 0, getImplicitArgsSize());
3411-
ImplArgs->BlockCountX = NumBlocks[0];
3412-
ImplArgs->BlockCountY = NumBlocks[1];
3413-
ImplArgs->BlockCountZ = NumBlocks[2];
3414-
ImplArgs->GroupSizeX = NumThreads[0];
3415-
ImplArgs->GroupSizeY = NumThreads[1];
3416-
ImplArgs->GroupSizeZ = NumThreads[2];
3417-
ImplArgs->GridDims = NumBlocks[2] * NumThreads[2] > 1
3418-
? 3
3419-
: 1 + (NumBlocks[1] * NumThreads[1] != 1);
3420-
ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem;
3405+
uint64_t ImplArgsOffset = utils::roundUp(
3406+
LaunchParams.Size, alignof(hsa_utils::AMDGPUImplicitArgsTy));
3407+
if (ArgsSize > ImplArgsOffset) {
3408+
hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3409+
reinterpret_cast<hsa_utils::AMDGPUImplicitArgsTy *>(
3410+
utils::advancePtr(AllArgs, ImplArgsOffset));
3411+
3412+
// Set the COV5+ implicit arguments to the appropriate values if present.
3413+
uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3414+
std::memset(ImplArgs, 0, ImplArgsSize);
3415+
3416+
using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3417+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3418+
NumBlocks[0]);
3419+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3420+
NumBlocks[1]);
3421+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3422+
NumBlocks[2]);
3423+
3424+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3425+
NumThreads[0]);
3426+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3427+
NumThreads[1]);
3428+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3429+
NumThreads[2]);
3430+
3431+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3432+
NumBlocks[2] * NumThreads[2] > 1
3433+
? 3
3434+
: 1 + (NumBlocks[1] * NumThreads[1] != 1));
3435+
3436+
hsa_utils::initImplArg(ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3437+
KernelArgs.DynCGroupMem);
34213438
}
34223439

34233440
// Push the kernel launch into the stream.

offload/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <cstdint>
1414

1515
#include "Shared/Debug.h"
16+
#include "Shared/Utils.h"
1617
#include "Utils/ELF.h"
1718

1819
#include "omptarget.h"
@@ -26,7 +27,7 @@ namespace plugin {
2627
namespace hsa_utils {
2728

2829
// The implicit arguments of COV5 AMDGPU kernels.
29-
struct AMDGPUImplicitArgsTy {
30+
struct alignas(alignof(void *)) AMDGPUImplicitArgsTy {
3031
uint32_t BlockCountX;
3132
uint32_t BlockCountY;
3233
uint32_t BlockCountZ;
@@ -60,6 +61,18 @@ inline Error readAMDGPUMetaDataFromImage(
6061
return Err;
6162
}
6263

64+
/// Initializes the HSA implicit argument if the struct size permits it. This is
65+
/// necessary because optimizations can modify the size of the struct if
66+
/// portions of it are unused.
67+
template <typename MemberTy, typename T>
68+
void initImplArg(AMDGPUImplicitArgsTy *Base,
69+
MemberTy AMDGPUImplicitArgsTy::*Member, size_t AvailableSize,
70+
T Value) {
71+
uint64_t Offset = utils::getPtrDiff(&(Base->*Member), Base);
72+
if (Offset + sizeof(MemberTy) <= AvailableSize)
73+
Base->*Member = static_cast<MemberTy>(Value);
74+
}
75+
6376
} // namespace hsa_utils
6477
} // namespace plugin
6578
} // namespace target

0 commit comments

Comments
 (0)