-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[AMDGPU] Update code object metadata for kernarg preload #134666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Tracks the registers that explicit and hidden arguments are preloaded to with new code object metadata. IR arguments may be split across multiple parts by isel, and SGPR tuple alignment means that an argument may be spread across multiple registers. To support this, some of the utilities for hidden kernel arguments are moved to `AMDGPUArgumentUsageInfo.h`. Additional bookkeeping is also needed for tracking purposes.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-support Author: Austin Kerbow (kerbowa) Changes: Tracks the registers that explicit and hidden arguments are preloaded to with new code object metadata. IR arguments may be split across multiple parts by isel, and SGPR tuple alignment means that an argument may be spread across multiple registers. To support this, some of the utilities for hidden kernel arguments are moved to `AMDGPUArgumentUsageInfo.h`. Additional bookkeeping is also needed for tracking purposes. Patch is 78.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134666.diff 21 Files Affected:
diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h
index 76ac7ab74a32e..d5e0f4031b0f6 100644
--- a/llvm/include/llvm/Support/AMDGPUMetadata.h
+++ b/llvm/include/llvm/Support/AMDGPUMetadata.h
@@ -47,7 +47,7 @@ constexpr uint32_t VersionMinorV5 = 2;
/// HSA metadata major version for code object V6.
constexpr uint32_t VersionMajorV6 = 1;
/// HSA metadata minor version for code object V6.
-constexpr uint32_t VersionMinorV6 = 2;
+constexpr uint32_t VersionMinorV6 = 3;
/// Old HSA metadata beginning assembler directive for V2. This is only used for
/// diagnostics now.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d158f0f58d711..06504a081e6f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -16,12 +16,15 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::KernArgPreload;
#define DEBUG_TYPE "amdgpu-argument-reg-usage-info"
INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE,
"Argument Register Usage Information Storage", false, true)
+constexpr HiddenArgInfo HiddenArgUtils::HiddenArgs[END_HIDDEN_ARGS];
+
void ArgDescriptor::print(raw_ostream &OS,
const TargetRegisterInfo *TRI) const {
if (!isSet()) {
@@ -176,6 +179,37 @@ AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
return AI;
}
+SmallVector<const KernArgPreloadDescriptor *, 4>
+AMDGPUFunctionArgInfo::getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const {
+ SmallVector<const KernArgPreloadDescriptor *, 4> Results;
+ for (const auto &KV : PreloadKernArgs) {
+ if (KV.second.OrigArgIdx == ArgIdx)
+ Results.push_back(&KV.second);
+ }
+
+ llvm::stable_sort(Results, [](const KernArgPreloadDescriptor *A,
+ const KernArgPreloadDescriptor *B) {
+ return A->PartIdx < B->PartIdx;
+ });
+
+ return Results;
+}
+
+std::optional<const KernArgPreloadDescriptor *>
+AMDGPUFunctionArgInfo::getHiddenArgPreloadDescriptor(HiddenArg HA) const {
+ assert(HA < END_HIDDEN_ARGS);
+
+ auto HiddenArgIt = PreloadHiddenArgsIndexMap.find(HA);
+ if (HiddenArgIt == PreloadHiddenArgsIndexMap.end())
+ return std::nullopt;
+
+ auto KernArgIt = PreloadKernArgs.find(HiddenArgIt->second);
+ if (KernArgIt == PreloadKernArgs.end())
+ return std::nullopt;
+
+ return &KernArgIt->second;
+}
+
const AMDGPUFunctionArgInfo &
AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
auto I = ArgInfoMap.find(&F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e07d47381ecca..ee4dba31f2617 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -11,7 +11,10 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Register.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
namespace llvm {
@@ -95,11 +98,78 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
return OS;
}
-struct KernArgPreloadDescriptor : public ArgDescriptor {
- KernArgPreloadDescriptor() {}
- SmallVector<MCRegister> Regs;
+namespace KernArgPreload {
+
+enum HiddenArg {
+ HIDDEN_BLOCK_COUNT_X,
+ HIDDEN_BLOCK_COUNT_Y,
+ HIDDEN_BLOCK_COUNT_Z,
+ HIDDEN_GROUP_SIZE_X,
+ HIDDEN_GROUP_SIZE_Y,
+ HIDDEN_GROUP_SIZE_Z,
+ HIDDEN_REMAINDER_X,
+ HIDDEN_REMAINDER_Y,
+ HIDDEN_REMAINDER_Z,
+ END_HIDDEN_ARGS
};
+// Stores information about a specific hidden argument.
+struct HiddenArgInfo {
+ // Offset in bytes from the location in the kernarg segment pointed to by
+ // the implicitarg pointer.
+ uint8_t Offset;
+ // The size of the hidden argument in bytes.
+ uint8_t Size;
+ // The name of the hidden argument in the kernel signature.
+ const char *Name;
+};
+
+struct HiddenArgUtils {
+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
+ {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
+ {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
+ {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
+ {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
+ {22, 2, "_hidden_remainder_z"}};
+
+ static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
+ for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
+ if (HiddenArgs[I].Offset == Offset)
+ return static_cast<HiddenArg>(I);
+
+ return END_HIDDEN_ARGS;
+ }
+
+ static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
+ if (HA < END_HIDDEN_ARGS)
+ return static_cast<Type *>(Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8));
+
+ llvm_unreachable("Unexpected hidden argument.");
+ }
+
+ static const char *getHiddenArgName(HiddenArg HA) {
+ if (HA < END_HIDDEN_ARGS) {
+ return HiddenArgs[HA].Name;
+ }
+ llvm_unreachable("Unexpected hidden argument.");
+ }
+};
+
+struct KernArgPreloadDescriptor {
+ // Id of the original argument in the IR kernel function argument list.
+ unsigned OrigArgIdx = 0;
+
+ // If this IR argument was split into multiple parts, this is the index of the
+ // part in the original argument.
+ unsigned PartIdx = 0;
+
+ // The registers that the argument is preloaded into. The argument may be
+ // split across multiple registers.
+ SmallVector<MCRegister, 2> Regs;
+};
+
+} // namespace KernArgPreload
+
struct AMDGPUFunctionArgInfo {
// clang-format off
enum PreloadedValue {
@@ -161,7 +231,10 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor WorkItemIDZ;
// Map the index of preloaded kernel arguments to its descriptor.
- SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
+ SmallDenseMap<int, KernArgPreload::KernArgPreloadDescriptor>
+ PreloadKernArgs{};
+ // Map hidden argument to the index of its descriptor.
+ SmallDenseMap<KernArgPreload::HiddenArg, int> PreloadHiddenArgsIndexMap{};
// The first user SGPR allocated for kernarg preloading.
Register FirstKernArgPreloadReg;
@@ -169,6 +242,16 @@ struct AMDGPUFunctionArgInfo {
getPreloadedValue(PreloadedValue Value) const;
static AMDGPUFunctionArgInfo fixedABILayout();
+
+ // Returns preload argument descriptors for an IR argument index. Isel may
+ // split IR arguments into multiple parts, the return vector holds all parts
+ // associated with an IR argument in the kernel signature.
+ SmallVector<const KernArgPreload::KernArgPreloadDescriptor *, 4>
+ getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const;
+
+ // Returns the hidden arguments `KernArgPreloadDescriptor` if it is preloaded.
+ std::optional<const KernArgPreload::KernArgPreloadDescriptor *>
+ getHiddenArgPreloadDescriptor(KernArgPreload::HiddenArg HA) const;
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 2991778a1bbc7..f6f71b2d042d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
@@ -290,7 +291,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
if (Arg.hasAttribute("amdgpu-hidden-argument"))
continue;
- emitKernelArg(Arg, Offset, Args);
+ emitKernelArg(Arg, Offset, Args, MF);
}
emitHiddenKernelArgs(MF, Offset, Args);
@@ -300,7 +301,8 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
unsigned &Offset,
- msgpack::ArrayDocNode Args) {
+ msgpack::ArrayDocNode Args,
+ const MachineFunction &MF) {
const auto *Func = Arg.getParent();
auto ArgNo = Arg.getArgNo();
const MDNode *Node;
@@ -357,17 +359,18 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
Align ArgAlign;
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
- emitKernelArg(DL, ArgTy, ArgAlign,
- getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
- PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
- AccQual, TypeQual);
+ emitKernelArgImpl(DL, ArgTy, ArgAlign,
+ getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
+ "" /* PreloadRegisters */, PointeeAlign, Name, TypeName,
+ BaseTypeName, ActAccQual, AccQual, TypeQual);
}
-void MetadataStreamerMsgPackV4::emitKernelArg(
+void MetadataStreamerMsgPackV4::emitKernelArgImpl(
const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
- unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
- StringRef Name, StringRef TypeName, StringRef BaseTypeName,
- StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
+ unsigned &Offset, msgpack::ArrayDocNode Args, StringRef PreloadRegisters,
+ MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName,
+ StringRef BaseTypeName, StringRef ActAccQual, StringRef AccQual,
+ StringRef TypeQual) {
auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
@@ -409,6 +412,11 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
}
+ if (!PreloadRegisters.empty()) {
+ Arg[".preload_registers"] =
+ Arg.getDocument()->getNode(PreloadRegisters, /*Copy=*/true);
+ }
+
Args.push_back(Arg);
}
@@ -428,14 +436,14 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
if (HiddenArgNumBytes >= 8)
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
- Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
+ Args);
if (HiddenArgNumBytes >= 16)
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
- Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
+ Args);
if (HiddenArgNumBytes >= 24)
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
- Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
+ Args);
auto *Int8PtrTy =
PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
@@ -445,42 +453,42 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
// before code object V5, which makes the mutual exclusion between the
// "printf buffer" and "hostcall buffer" here sound.
if (M->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+ Args);
else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer",
+ Offset, Args);
else
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
// Emit "default queue" and "completion action" arguments if enqueue kernel is
// used, otherwise emit dummy "none" arguments.
if (HiddenArgNumBytes >= 40) {
if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+ Args);
} else {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
if (HiddenArgNumBytes >= 48) {
if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
+ Offset, Args);
} else {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
// Emit the pointer argument for multi-grid object.
if (HiddenArgNumBytes >= 56) {
if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
+ Offset, Args);
} else {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
}
}
}
@@ -635,77 +643,83 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
auto *Int16Ty = Type::getInt16Ty(Func.getContext());
Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset,
+ Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset,
+ Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset,
+ Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
// Reserved for hidden_tool_correlation_id.
Offset += 8;
Offset += 8; // Reserved.
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
- emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
+ Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
+ Args);
+ emitKernelArgImpl(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
+ Args);
- emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
+ emitKernelArgImpl(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
Offset += 6; // Reserved.
auto *Int8PtrTy =
PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
if (M->getNamedMetadata("llvm.printf.fmts")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+ Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
+ Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg",
+ Offset, Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
else
Offset += 8; // Skipped.
if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+ Args);
} else {
Offset += 8; // Skipped.
}
if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
- Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_completion_action",
+ Offset, Args);
} else {
Offset += 8; // Skipped.
}
// Emit argument for hidden dynamic lds size
if (MFI.isDynamicLDSUsed()) {
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
- Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
+ Args);
} else {
Offset += 4; // skipped
}
@@ -715,14 +729,17 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
// hidden_private_base and hidden_shared_base are only when the subtarget has
// ApertureRegs.
if (!ST.hasApertureRegs()) {
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
- emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_private_base", Offset,
+ Args);
+ emitKernelArgImpl(DL, Int32Ty, Align(4), "hidden_shared_base", Offset,
+ Args);
} else {
Offset += 8; // Skipped.
}
if (MFI.getUserSGPRInfo().hasQueuePtr())
- emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
+ emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset,
+ Args);
}
void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
@@ -745,5 +762,241 @@ void MetadataStreamerMsgPackV6::emitVersion() {
getRootMetadata("amdhsa.version") = Version;
}
+void MetadataStreamerMsgPackV6::emitHiddenKernelArgWithPreload(
+ const DataLayout &DL, Type *ArgTy, Align Alignment,
+ KernArgPreload::HiddenArg HiddenArg, StringRef ArgName, unsigned &Offset,
+ msgpack::ArrayDocNode Args, const AMDGPUFunctionArgInfo &ArgInfo) {
+
+ SmallString<16> PreloadStr;
+ auto PreloadDesc = ArgInfo.getHiddenArgPreloadDescriptor(HiddenArg);
+ if (PreloadDesc) {
+ const auto &Regs = (*PreloadDesc)->Regs;
+ for (unsigned I = 0; I < Regs.size(); ++I) {
+ if (I > 0)
+ PreloadStr += " ";
+ PreloadStr += AMDGPUInstPrinter::getRegisterName(Regs[I]);
+ }
+ }
+ emitKernelArgImpl(DL, ArgTy, Alignment, ArgName, Offset, Args, PreloadStr);
+}
+
+void MetadataStreamerMsgPackV6::emitHiddenKernelArgs(
+ const MachineFunction &MF, unsigne...
[truncated]
|
Results.push_back(&KV.second); | ||
} | ||
|
||
llvm::stable_sort(Results, [](const KernArgPreloadDescriptor *A, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `llvm::` prefix is not necessary.
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) | ||
if (HiddenArgs[I].Offset == Offset) | ||
return static_cast<HiddenArg>(I); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) | |
if (HiddenArgs[I].Offset == Offset) | |
return static_cast<HiddenArg>(I); | |
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I) { | |
if (HiddenArgs[I].Offset == Offset) | |
return static_cast<HiddenArg>(I); | |
} |
// There's no distinction between byval aggregates and raw aggregates. | ||
Type *ArgTy; | ||
Align ArgAlign; | ||
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); | |
auto [ArgTy, ArgAlign] = getArgumentTypeAlign(Arg, DL); |
if (M->getNamedMetadata("llvm.printf.fmts")) { | ||
emitKernelArgImpl(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, | ||
Args); | ||
} else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
small nit: don't use `{}` for the elses in this function; they're all one line?
} | ||
|
||
static const char *getHiddenArgName(HiddenArg HA) { | ||
if (HA < END_HIDDEN_ARGS) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
small nit: don't use `{}` here
|
||
static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) { | ||
if (HA < END_HIDDEN_ARGS) | ||
return static_cast<Type *>(Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need the cast
if (HA < END_HIDDEN_ARGS) { | ||
return HiddenArgs[HA].Name; | ||
} | ||
llvm_unreachable("Unexpected hidden argument."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
llvm_unreachable("Unexpected hidden argument."); | |
llvm_unreachable("unexpected hidden argument"); |
getPreloadDescriptorsForArgIdx(unsigned ArgIdx) const; | ||
|
||
// Returns the hidden arguments `KernArgPreloadDescriptor` if it is preloaded. | ||
std::optional<const KernArgPreload::KernArgPreloadDescriptor *> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid optional of pointer
Args); | ||
} | ||
|
||
void MetadataStreamerMsgPackV6::emitKernelArg(const Argument &Arg, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is mostly duplicated from the v5 version, do we really need another copy?
!2 = !{!"2:1:8:%g\5Cn"} | ||
|
||
attributes #0 = { optnone noinline } | ||
attributes #1 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing end of file newline
store i32 %add, ptr addrspace(1) %out, align 4 | ||
ret void | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test a preloaded vector? Is inreg supported on other aggregates?
Tracks the registers that explicit and hidden arguments are preloaded to
with new code object metadata.
IR arguments may be split across multiple parts by isel, and SGPR tuple
alignment means that an argument may be spread across multiple
registers.
To support this, some of the utilities for hidden kernel arguments are
moved to `AMDGPUArgumentUsageInfo.h`. Additional bookkeeping is also needed for tracking purposes.