-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS #89683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// The purpose of this pass is to ensure that the combined module contains | ||
// as many LDS global variables as there are kernels that (indirectly) access | ||
// them. As LDS variables behave like C++ static variables, it is important that | ||
// each partition contains a unique copy of the variable on a per kernel basis. | ||
// This representation also prepares the combined module to eliminate | ||
// cross-module false dependencies of LDS variables. This pass runs prior to the | ||
// AMDGPULowerModuleLDS pass in the fullLTO pipeline and is used to improve | ||
// the functionality of --lto-partitions. | ||
// | ||
// This pass operates as follows: | ||
// 1. Firstly, traverse the call graph from each kernel to determine the number | ||
// of kernels calling each device function. | ||
// 2. For each LDS global variable GV, determine the function F that defines it. | ||
// Collect its caller functions. Clone F and GV, and finally insert a | ||
// call/invoke instruction in each caller function. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "AMDGPU.h" | ||
#include "Utils/AMDGPUMemoryUtils.h" | ||
#include "llvm/ADT/DepthFirstIterator.h" | ||
#include "llvm/ADT/Twine.h" | ||
#include "llvm/Analysis/CallGraph.h" | ||
#include "llvm/IR/InstrTypes.h" | ||
#include "llvm/IR/Instructions.h" | ||
#include "llvm/Passes/PassBuilder.h" | ||
#include "llvm/Support/ScopedPrinter.h" | ||
#include "llvm/Transforms/Utils/Cloning.h" | ||
|
||
using namespace llvm; | ||
using GVToFnMapTy = DenseMap<GlobalVariable *, Function *>; | ||
|
||
#define DEBUG_TYPE "amdgpu-clone-module-lds" | ||
|
||
static cl::opt<unsigned int> MaxCountForClonedFunctions( | ||
"clone-lds-functions-max-count", cl::init(16), cl::Hidden, | ||
cl::desc("Specify a limit to the number of clones of a function")); | ||
|
||
/// Find a function that references \p GV.
///
/// Walks the transitive users of \p GV with a worklist. The first user that
/// is an Instruction decides the result. Users that are Operators are
/// expanded so that their own users are examined as well. Any other kind of
/// user is ignored.
///
/// \param GV The global variable in question
/// \return The function containing the first instruction user encountered,
///         or nullptr when the walk finds no instruction user
static Function *getFunctionDefiningGV(GlobalVariable &GV) {
  for (SmallVector<User *> Pending(GV.users()); !Pending.empty();) {
    User *Current = Pending.pop_back_val();

    // An instruction pins the global to the function that contains it.
    if (isa<Instruction>(Current))
      return cast<Instruction>(Current)->getFunction();

    // Only operators are looked through; everything else is dropped.
    if (!isa<Operator>(Current))
      continue;
    append_range(Pending, cast<Operator>(Current)->users());
  }
  return nullptr;
}
|
||
/// Return a map of LDS globals paired with the function defining them.
///
/// Globals reached through the module's aliases are recorded first, followed
/// by the module's own global variables. Each global is recorded at most
/// once. The mapped function comes from getFunctionDefiningGV() and is
/// nullptr when that lookup finds no instruction user, so callers must check
/// the mapped value before dereferencing it.
///
/// NOTE(review): users that are neither instructions nor operators (for
/// example a global initializer such as llvm.used holding the address) are
/// not resolved to a function here; the general initializer case still needs
/// a decision.
///
/// \param M Module in question
/// \return Map of LDS global variables and their functions
static GVToFnMapTy collectModuleGlobals(Module &M) {
  GVToFnMapTy GVToFnMap;

  // Record an LDS global the first time it is seen. The membership test comes
  // before the insertion so the user walk is not repeated for known globals.
  auto RecordIfLDS = [&GVToFnMap](GlobalVariable &GV) {
    if (!AMDGPU::isLDSVariableToLower(GV) || GVToFnMap.contains(&GV))
      return;
    GVToFnMap.insert({&GV, getFunctionDefiningGV(GV)});
  };

  for (auto &GA : M.aliases()) {
    // getAliaseeObject() may return nullptr when the aliasee cannot be
    // resolved to a global object, and dyn_cast asserts on a null input.
    if (auto *GV = dyn_cast_if_present<GlobalVariable>(GA.getAliaseeObject()))
      RecordIfLDS(*GV);
  }

  for (auto &GV : M.globals())
    RecordIfLDS(GV);

  return GVToFnMap;
}
|
||
/// Clone each function that uses an LDS global, together with that global,
/// once for every additional kernel that reaches the function.
///
/// The number of copies (original included) is the smaller of the number of
/// kernels reaching the function and -clone-lds-functions-max-count. For each
/// clone, every direct call to the original function gets a sibling call to
/// the clone inserted right after it; the original call is left in place.
///
/// \param M  Module to transform
/// \param AM Analysis manager used to obtain the call graph of \p M
/// \return PreservedAnalyses::none() if any clone was created, otherwise
///         PreservedAnalyses::all()
PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &AM) {
  // A limit of one leaves room for the original only, so no clone is made.
  const unsigned int MaxCopies = MaxCountForClonedFunctions.getValue();
  if (MaxCopies == 1)
    return PreservedAnalyses::all();

  bool Changed = false;
  auto &CG = AM.getResult<CallGraphAnalysis>(M);

  // For each function in the call graph, determine the number of
  // ancestor-caller kernels. The walk starts at the kernel's own node so that
  // only functions reachable from that kernel are counted for it.
  DenseMap<Function *, unsigned int> KernelRefsToFuncs;
  for (auto &Fn : M) {
    if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;
    CallGraphNode *KernelNode = CG[&Fn];
    for (auto I = df_begin(KernelNode), E = df_end(KernelNode); I != E; ++I) {
      // Nodes without a function (external callers/callees) are skipped.
      if (auto *F = I->getFunction())
        KernelRefsToFuncs[F]++;
    }
  }

  GVToFnMapTy GVToFnMap = collectModuleGlobals(M);
  for (const auto &[GV, OldF] : GVToFnMap) {
    // The map holds nullptr when no instruction uses the global; there is
    // nothing to clone in that case.
    if (!OldF) {
      LLVM_DEBUG(dbgs() << "Found LDS " << GV->getName()
                        << " without a using function; skipping\n");
      continue;
    }
    LLVM_DEBUG(dbgs() << "Found LDS " << GV->getName() << " used in function "
                      << OldF->getName() << '\n');

    // Collect all call instructions to OldF. Uses where OldF is merely passed
    // as an argument are ignored, because retargeting the callee of such a
    // call would redirect an unrelated function.
    SmallVector<CallBase *> CallsToOldF;
    for (Use &U : OldF->uses()) {
      auto *CB = dyn_cast<CallBase>(U.getUser());
      if (CB && CB->isCallee(&U))
        CallsToOldF.push_back(CB);
    }

    // Create as many clones of the function containing LDS global as
    // there are kernels calling the function (including the function
    // already defining the LDS global). Respectively, clone the
    // LDS global and the call instructions to the function.
    // lookup() is used so that querying the count does not add map entries.
    const unsigned int KernelRefs = KernelRefsToFuncs.lookup(OldF);
    const unsigned int NumCopies = std::min<unsigned int>(KernelRefs, MaxCopies);
    LLVM_DEBUG(dbgs() << "\tFunction is referenced by " << KernelRefs
                      << " kernels.\n");

    for (unsigned int ID = 0; ID + 1 < NumCopies; ++ID) {
      // Clone LDS global variable
      auto *NewGV = new GlobalVariable(
          M, GV->getValueType(), GV->isConstant(), GlobalValue::InternalLinkage,
          PoisonValue::get(GV->getValueType()),
          GV->getName() + ".clone." + Twine(ID), GV,
          GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
      NewGV->copyAttributesFrom(GV);
      NewGV->copyMetadata(GV, 0);
      NewGV->setComdat(GV->getComdat());
      LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
                        << '\n');

      // Clone function; the value map redirects references from the original
      // global to its clone inside the new function body.
      ValueToValueMapTy VMap;
      VMap[GV] = NewGV;
      auto *NewF = CloneFunction(OldF, VMap);
      NewF->setName(OldF->getName() + ".clone." + Twine(ID));
      LLVM_DEBUG(dbgs() << "Inserting function clone with name "
                        << NewF->getName() << '\n');

      // Create a new call to the cloned function next to each original call.
      // NOTE(review): inserting after the original assumes it is not a block
      // terminator (e.g. an invoke) - confirm.
      for (CallBase *Call : CallsToOldF) {
        auto *NewCall = cast<CallBase>(Call->clone());
        // Void values cannot carry a name, so only name value-producing calls.
        if (!NewCall->getType()->isVoidTy())
          NewCall->setName(Call->getName() + ".clone." + Twine(ID));
        NewCall->setCalledOperand(NewF);
        NewCall->insertAfter(Call);
        LLVM_DEBUG(dbgs() << "Inserting inst: " << *NewCall << '\n');
      }

      Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s | ||
|
||
; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t | ||
; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a | ||
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s | ||
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s | ||
|
||
target triple = "amdgcn-amd-amdhsa" | ||
|
||
; Before transformation,                    After transformation,
;  K1  K2                                    K1  K2
;  |  /                                      |  /
;  | /                                       | /
;  A           ==>                           A
;  | \                                       | \
;  |  \                                      |  \
;  B   C                                     B   C
;  |                                         | \
;  X                                         X1 X2
;
; where X contains an LDS reference
|
||
; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16 | ||
; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 | ||
@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16 | ||
|
||
define protected amdgpu_kernel void @kernel1(i32 %n) #3 { | ||
; CHECK-LABEL: define protected amdgpu_kernel void @kernel1( | ||
; CHECK-SAME: i32 [[N:%.*]]) { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) | ||
; CHECK-NEXT: ret void | ||
; | ||
entry: | ||
%call = call i32 @A(i32 %n) | ||
ret void | ||
} | ||
|
||
define protected amdgpu_kernel void @kernel2(i32 %n) #3 { | ||
; CHECK-LABEL: define protected amdgpu_kernel void @kernel2( | ||
; CHECK-SAME: i32 [[N:%.*]]) { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]]) | ||
; CHECK-NEXT: ret void | ||
; | ||
entry: | ||
%call = call i32 @A(i32 %n) | ||
ret void | ||
} | ||
|
||
define void @A() { | ||
; CHECK-LABEL: define void @A() { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: call void @B() | ||
; CHECK-NEXT: call void @C() | ||
; CHECK-NEXT: ret void | ||
; | ||
entry: | ||
call void @B() | ||
call void @C() | ||
ret void | ||
} | ||
|
||
define i32 @B() { | ||
; CHECK-LABEL: define i32 @B() { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4 | ||
; CHECK-NEXT: store i32 5, ptr [[P]], align 4 | ||
; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]]) | ||
; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]]) | ||
; CHECK-NEXT: ret i32 [[RET]] | ||
; | ||
entry: | ||
%p = alloca i32 | ||
store i32 5, ptr %p | ||
%ret = call i32 @X(ptr %p) | ||
ret i32 %ret | ||
} | ||
|
||
define void @C() { | ||
; CHECK-LABEL: define void @C() { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: ret void | ||
; | ||
entry: | ||
ret void | ||
} | ||
|
||
define i32 @X(ptr %x) { | ||
; CHECK-LABEL: define i32 @X( | ||
; CHECK-SAME: ptr [[X:%.*]]) { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0 | ||
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4 | ||
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 | ||
; CHECK-NEXT: ret i32 [[V]] | ||
; | ||
entry: | ||
%p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0 | ||
%v = load i32, ptr %x | ||
store i32 %v, ptr %p | ||
ret i32 %v | ||
} | ||
|
||
; CHECK-LABEL: define i32 @X.clone.0(ptr %x) { | ||
; CHECK-NEXT: entry: | ||
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0 | ||
; CHECK-NEXT: %v = load i32, ptr %x, align 4 | ||
; CHECK-NEXT: store i32 %v, ptr %p, align 4 | ||
; CHECK-NEXT: ret i32 %v | ||
|
||
; MOD0: {{.*}} addrspace(3) global [64 x i32] undef, align 16 | ||
; MOD0: define i32 @X(ptr %x) | ||
|
||
; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16 | ||
; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n) | ||
; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n) | ||
; MOD1: define void @A() | ||
; MOD1: define i32 @B() | ||
; MOD1: define i32 @X.clone.0(ptr %x) |
Uh oh!
There was an error while loading. Please reload this page.