Skip to content

Commit 58508ba

Browse files
authored
[SYCL] Generalize local accessor to shared mem pass (#5149)
Now that it lives in `SYCLLowerIR`, it can be easily shared between the AMDGCN and NVPTX backends. This requires the same alignment fix as for CUDA; see #5113. Fixes #5013.
1 parent 25acbd2 commit 58508ba

25 files changed

+393
-75
lines changed

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5790,6 +5790,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
57905790
CmdArgs.push_back("-treat-scalable-fixed-error-as-warning");
57915791
}
57925792

5793+
// Enable local accessor to shared memory pass for SYCL.
5794+
if (isa<BackendJobAction>(JA) && IsSYCL) {
5795+
CmdArgs.push_back("-mllvm");
5796+
CmdArgs.push_back("-sycl-enable-local-accessor");
5797+
}
57935798
// These two are potentially updated by AddClangCLArgs.
57945799
codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;
57955800
bool EmitCodeView = false;

clang/lib/Driver/ToolChains/HIPAMD.cpp

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -78,8 +78,12 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
7878
const llvm::opt::ArgList &Args) const {
7979
// Construct lld command.
8080
// The output from ld.lld is an HSA code object file.
81-
ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared",
82-
"-plugin-opt=-amdgpu-internalize-symbols"};
81+
ArgStringList LldArgs{"-flavor",
82+
"gnu",
83+
"--no-undefined",
84+
"-shared",
85+
"-plugin-opt=-amdgpu-internalize-symbols",
86+
"-plugin-opt=-sycl-enable-local-accessor"};
8387

8488
auto &TC = getToolChain();
8589
auto &D = TC.getDriver();
Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
/// Check the correct handling of sycl-enable-local-accessor option.
2+
3+
// REQUIRES: clang-driver
4+
5+
// RUN: %clang -fsycl -### %s 2>&1 \
6+
// RUN: | FileCheck -check-prefix=CHECK-NO-OPT %s
7+
// CHECK-NO-OPT-NOT: "-sycl-enable-local-accessor"
8+
9+
// RUN: %clang -fsycl -fsycl-targets=nvptx64-nvidia-cuda -### %s 2>&1 \
10+
// RUN: | FileCheck %s
11+
// CHECK: "-sycl-enable-local-accessor"

llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.h renamed to llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,9 @@
88
//
99
// This pass operates on SYCL kernels being compiled to CUDA. It modifies
1010
// kernel entry points which take pointers to shared memory and modifies them
11-
// to take offsets into shared memory (represented by a symbol in the shared address
12-
// space). The SYCL runtime is expected to provide offsets rather than pointers
13-
// to these functions.
11+
// to take offsets into shared memory (represented by a symbol in the shared
12+
// address space). The SYCL runtime is expected to provide offsets rather than
13+
// pointers to these functions.
1414
//
1515
//===----------------------------------------------------------------------===//
1616

llvm/lib/SYCLLowerIR/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@ add_llvm_component_library(LLVMSYCLLowerIR
5252
LowerWGLocalMemory.cpp
5353
MutatePrintfAddrspace.cpp
5454

55+
LocalAccessorToSharedMemory.cpp
56+
5557
ADDITIONAL_HEADER_DIRS
5658
${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR
5759
${LLVM_MAIN_SRC_DIR}/projects/vc-intrinsics/GenXIntrinsics/include

llvm/lib/Target/NVPTX/SYCL/LocalAccessorToSharedMemory.cpp renamed to llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp

Lines changed: 155 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -14,92 +14,115 @@
1414
//
1515
//===----------------------------------------------------------------------===//
1616

17-
#include "LocalAccessorToSharedMemory.h"
18-
#include "../MCTargetDesc/NVPTXBaseInfo.h"
17+
#include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h"
1918
#include "llvm/IR/GlobalValue.h"
2019
#include "llvm/IR/Instructions.h"
2120
#include "llvm/IR/PassManager.h"
21+
#include "llvm/Support/CommandLine.h"
2222
#include "llvm/Transforms/IPO.h"
2323

2424
using namespace llvm;
2525

2626
#define DEBUG_TYPE "localaccessortosharedmemory"
2727

28+
static bool EnableLocalAccessor;
29+
30+
static cl::opt<bool, true> EnableLocalAccessorFlag(
31+
"sycl-enable-local-accessor", cl::Hidden,
32+
cl::desc("Enable local accessor to shared memory optimisation."),
33+
cl::location(EnableLocalAccessor), cl::init(false));
34+
2835
namespace llvm {
2936
void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);
30-
}
37+
} // namespace llvm
3138

3239
namespace {
3340

3441
class LocalAccessorToSharedMemory : public ModulePass {
42+
private:
43+
enum class ArchType { Cuda, AMDHSA, Unsupported };
44+
45+
struct KernelPayload {
46+
KernelPayload(Function *Kernel, MDNode *MD = nullptr)
47+
: Kernel(Kernel), MD(MD){};
48+
Function *Kernel;
49+
MDNode *MD;
50+
};
51+
52+
unsigned SharedASValue = 0;
53+
3554
public:
3655
static char ID;
3756
LocalAccessorToSharedMemory() : ModulePass(ID) {}
3857

3958
bool runOnModule(Module &M) override {
59+
if (!EnableLocalAccessor)
60+
return false;
61+
62+
auto AT = StringSwitch<ArchType>(M.getTargetTriple().c_str())
63+
.Case("nvptx64-nvidia-cuda", ArchType::Cuda)
64+
.Case("nvptx-nvidia-cuda", ArchType::Cuda)
65+
.Case("amdgcn-amd-amdhsa", ArchType::AMDHSA)
66+
.Default(ArchType::Unsupported);
67+
4068
// Invariant: This pass is only intended to operate on SYCL kernels being
41-
// compiled to the `nvptx{,64}-nvidia-cuda` triple.
42-
// TODO: make sure that non-SYCL kernels are not impacted.
69+
// compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa`
70+
// triples.
71+
if (ArchType::Unsupported == AT)
72+
return false;
73+
4374
if (skipModule(M))
4475
return false;
4576

46-
// Keep track of whether the module was changed.
47-
auto Changed = false;
77+
switch (AT) {
78+
case ArchType::Cuda:
79+
// ADDRESS_SPACE_SHARED = 3,
80+
SharedASValue = 3;
81+
break;
82+
case ArchType::AMDHSA:
83+
// LOCAL_ADDRESS = 3,
84+
SharedASValue = 3;
85+
break;
86+
default:
87+
SharedASValue = 0;
88+
break;
89+
}
4890

49-
// Access `nvvm.annotations` to determine which functions are kernel entry
50-
// points.
51-
auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
52-
if (!NvvmMetadata)
91+
SmallVector<KernelPayload> Kernels;
92+
SmallVector<std::pair<Function *, KernelPayload>> NewToOldKernels;
93+
populateKernels(M, Kernels, AT);
94+
if (Kernels.empty())
5395
return false;
5496

55-
for (auto MetadataNode : NvvmMetadata->operands()) {
56-
if (MetadataNode->getNumOperands() != 3)
57-
continue;
97+
// Process the function and if changed, update the metadata.
98+
for (auto K : Kernels) {
99+
auto *NewKernel = processKernel(M, K.Kernel);
100+
if (NewKernel)
101+
NewToOldKernels.push_back(std::make_pair(NewKernel, K));
102+
}
58103

59-
// NVPTX identifies kernel entry points using metadata nodes of the form:
60-
// !X = !{<function>, !"kernel", i32 1}
61-
const MDOperand &TypeOperand = MetadataNode->getOperand(1);
62-
auto Type = dyn_cast<MDString>(TypeOperand);
63-
if (!Type)
64-
continue;
65-
// Only process kernel entry points.
66-
if (Type->getString() != "kernel")
67-
continue;
104+
if (NewToOldKernels.empty())
105+
return false;
68106

69-
// Get a pointer to the entry point function from the metadata.
70-
const MDOperand &FuncOperand = MetadataNode->getOperand(0);
71-
if (!FuncOperand)
72-
continue;
73-
auto FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
74-
if (!FuncConstant)
75-
continue;
76-
auto Func = dyn_cast<Function>(FuncConstant->getValue());
77-
if (!Func)
78-
continue;
107+
postProcessKernels(NewToOldKernels, AT);
79108

80-
// Process the function and if changed, update the metadata.
81-
auto NewFunc = this->ProcessFunction(M, Func);
82-
if (NewFunc) {
83-
Changed = true;
84-
MetadataNode->replaceOperandWith(
85-
0, llvm::ConstantAsMetadata::get(NewFunc));
86-
}
87-
}
109+
return true;
110+
}
88111

89-
return Changed;
112+
virtual llvm::StringRef getPassName() const override {
113+
return "SYCL Local Accessor to Shared Memory";
90114
}
91115

92-
Function *ProcessFunction(Module &M, Function *F) {
116+
private:
117+
Function *processKernel(Module &M, Function *F) {
93118
// Check if this function is eligible by having an argument that uses shared
94119
// memory.
95120
auto UsesLocalMemory = false;
96121
for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end();
97122
FA != FE; ++FA) {
98-
if (FA->getType()->isPointerTy()) {
99-
UsesLocalMemory =
100-
FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED;
101-
}
102-
if (UsesLocalMemory) {
123+
if (FA->getType()->isPointerTy() &&
124+
FA->getType()->getPointerAddressSpace() == SharedASValue) {
125+
UsesLocalMemory = true;
103126
break;
104127
}
105128
}
@@ -111,9 +134,9 @@ class LocalAccessorToSharedMemory : public ModulePass {
111134
// Create a global symbol to CUDA shared memory.
112135
auto SharedMemGlobalName = F->getName().str();
113136
SharedMemGlobalName.append("_shared_mem");
114-
auto SharedMemGlobalType =
137+
auto *SharedMemGlobalType =
115138
ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
116-
auto SharedMemGlobal = new GlobalVariable(
139+
auto *SharedMemGlobal = new GlobalVariable(
117140
/* Module= */ M,
118141
/* Type= */ &*SharedMemGlobalType,
119142
/* IsConstant= */ false,
@@ -122,7 +145,7 @@ class LocalAccessorToSharedMemory : public ModulePass {
122145
/* Name= */ Twine{SharedMemGlobalName},
123146
/* InsertBefore= */ nullptr,
124147
/* ThreadLocalMode= */ GlobalValue::NotThreadLocal,
125-
/* AddressSpace= */ ADDRESS_SPACE_SHARED,
148+
/* AddressSpace= */ SharedASValue,
126149
/* IsExternallyInitialized= */ false);
127150
SharedMemGlobal->setAlignment(Align(4));
128151

@@ -139,7 +162,7 @@ class LocalAccessorToSharedMemory : public ModulePass {
139162
for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end();
140163
FA != FE; ++FA, ++i) {
141164
if (FA->getType()->isPointerTy() &&
142-
FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED) {
165+
FA->getType()->getPointerAddressSpace() == SharedASValue) {
143166
// Replace pointers to shared memory with i32 offsets.
144167
Arguments.push_back(Type::getInt32Ty(M.getContext()));
145168
ArgumentAttributes.push_back(
@@ -178,8 +201,8 @@ class LocalAccessorToSharedMemory : public ModulePass {
178201
if (ArgumentReplaced[i]) {
179202
// If this argument was replaced, then create a `getelementptr`
180203
// instruction that uses it to recreate the pointer that was replaced.
181-
auto InsertBefore = &NF->getEntryBlock().front();
182-
auto PtrInst = GetElementPtrInst::CreateInBounds(
204+
auto *InsertBefore = &NF->getEntryBlock().front();
205+
auto *PtrInst = GetElementPtrInst::CreateInBounds(
183206
/* PointeeType= */ SharedMemGlobalType,
184207
/* Ptr= */ SharedMemGlobal,
185208
/* IdxList= */
@@ -191,7 +214,7 @@ class LocalAccessorToSharedMemory : public ModulePass {
191214
// Then create a bitcast to make sure the new pointer is the same type
192215
// as the old one. This will only ever be a `i8 addrspace(3)*` to `i32
193216
// addrspace(3)*` type of cast.
194-
auto CastInst = new BitCastInst(PtrInst, FA->getType());
217+
auto *CastInst = new BitCastInst(PtrInst, FA->getType());
195218
CastInst->insertAfter(PtrInst);
196219
NewValueForUse = CastInst;
197220
}
@@ -217,11 +240,85 @@ class LocalAccessorToSharedMemory : public ModulePass {
217240
return NF;
218241
}
219242

220-
virtual llvm::StringRef getPassName() const {
221-
return "localaccessortosharedmemory";
243+
void populateCudaKernels(Module &M, SmallVector<KernelPayload> &Kernels) {
244+
// Access `nvvm.annotations` to determine which functions are kernel entry
245+
// points.
246+
auto *NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
247+
if (!NvvmMetadata)
248+
return;
249+
250+
for (auto *MetadataNode : NvvmMetadata->operands()) {
251+
if (MetadataNode->getNumOperands() != 3)
252+
continue;
253+
254+
// NVPTX identifies kernel entry points using metadata nodes of the form:
255+
// !X = !{<function>, !"kernel", i32 1}
256+
const MDOperand &TypeOperand = MetadataNode->getOperand(1);
257+
auto *Type = dyn_cast<MDString>(TypeOperand);
258+
if (!Type)
259+
continue;
260+
// Only process kernel entry points.
261+
if (Type->getString() != "kernel")
262+
continue;
263+
264+
// Get a pointer to the entry point function from the metadata.
265+
const MDOperand &FuncOperand = MetadataNode->getOperand(0);
266+
if (!FuncOperand)
267+
continue;
268+
auto *FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
269+
if (!FuncConstant)
270+
continue;
271+
auto *Func = dyn_cast<Function>(FuncConstant->getValue());
272+
if (!Func)
273+
continue;
274+
275+
Kernels.push_back(KernelPayload(Func, MetadataNode));
276+
}
277+
}
278+
279+
void populateAMDKernels(Module &M, SmallVector<KernelPayload> &Kernels) {
280+
for (auto &F : M) {
281+
if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
282+
Kernels.push_back(KernelPayload(&F));
283+
}
222284
}
223-
};
224285

286+
void populateKernels(Module &M, SmallVector<KernelPayload> &Kernels,
287+
ArchType AT) {
288+
switch (AT) {
289+
case ArchType::Cuda:
290+
return populateCudaKernels(M, Kernels);
291+
case ArchType::AMDHSA:
292+
return populateAMDKernels(M, Kernels);
293+
default:
294+
llvm_unreachable("Unsupported arch type.");
295+
}
296+
}
297+
298+
void postProcessCudaKernels(
299+
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels) {
300+
for (auto &Pair : NewToOldKernels) {
301+
std::get<1>(Pair).MD->replaceOperandWith(
302+
0, llvm::ConstantAsMetadata::get(std::get<0>(Pair)));
303+
}
304+
}
305+
306+
void postProcessAMDKernels(
307+
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels) {}
308+
309+
void postProcessKernels(
310+
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels,
311+
ArchType AT) {
312+
switch (AT) {
313+
case ArchType::Cuda:
314+
return postProcessCudaKernels(NewToOldKernels);
315+
case ArchType::AMDHSA:
316+
return postProcessAMDKernels(NewToOldKernels);
317+
default:
318+
llvm_unreachable("Unsupported arch type.");
319+
}
320+
}
321+
};
225322
} // end anonymous namespace
226323

227324
char LocalAccessorToSharedMemory::ID = 0;

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
2525
FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
2626
void initializeAMDGPURegBankCombinerPass(PassRegistry &);
2727

28+
void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);
29+
2830
// SI Passes
2931
FunctionPass *createGCNDPPCombinePass();
3032
FunctionPass *createSIAnnotateControlFlowPass();

0 commit comments

Comments
 (0)