[AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS #89683

Status: Closed · wants to merge 3 commits
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass
  const TargetMachine &TM;
};

struct AMDGPUCloneModuleLDSPass
    : public PassInfoMixin<AMDGPUCloneModuleLDSPass> {
  PreservedAnalyses run(Module &, ModuleAnalysisManager &);
};

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;

156 changes: 156 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp
@@ -0,0 +1,156 @@
//===-- AMDGPUCloneModuleLDS.cpp ----------------------------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The purpose of this pass is to ensure that the combined module contains
// as many copies of each LDS global variable as there are kernels that
// (indirectly) access it. As LDS variables behave like C++ static variables,
// it is important that each partition contains a unique copy of the variable
// on a per-kernel basis. This representation also prepares the combined module
// to eliminate cross-module false dependencies of LDS variables. This pass
// runs prior to the AMDGPULowerModuleLDS pass in the full LTO pipeline and is
// used to improve the functionality of --lto-partitions.
//
// This pass operates as follows:
// 1. Traverse the call graph from each kernel to determine the number of
//    kernels calling each device function.
// 2. For each LDS global variable GV, determine the function F that defines
//    it. Collect its caller functions. Clone F and GV, and finally insert a
//    call/invoke instruction in each caller function.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;
using GVToFnMapTy = DenseMap<GlobalVariable *, Function *>;

#define DEBUG_TYPE "amdgpu-clone-module-lds"

static cl::opt<unsigned int> MaxCountForClonedFunctions(
    "clone-lds-functions-max-count", cl::init(16), cl::Hidden,
    cl::desc("Specify a limit to the number of clones of a function"));

/// Return the function that defines \p GV
/// \param GV The global variable in question
/// \return The function defining \p GV
static Function *getFunctionDefiningGV(GlobalVariable &GV) {
  SmallVector<User *> Worklist(GV.users());
  while (!Worklist.empty()) {
    User *U = Worklist.pop_back_val();
    if (auto *Inst = dyn_cast<Instruction>(U))
      return Inst->getFunction();
    if (auto *Op = dyn_cast<Operator>(U))
      append_range(Worklist, Op->users());
  }
  return nullptr;
}

/// Return a map of LDS globals paired with the function defining them
/// \param M Module in question
/// \return Map of LDS global variables and their functions
static GVToFnMapTy collectModuleGlobals(Module &M) {
  GVToFnMapTy GVToFnMap;
  for (auto &GA : M.aliases()) {
    if (auto *GV = dyn_cast<GlobalVariable>(GA.getAliaseeObject())) {
      if (AMDGPU::isLDSVariableToLower(*GV) && !GVToFnMap.contains(GV))
        GVToFnMap.insert({GV, getFunctionDefiningGV(*GV)});
    }
Review thread on this hunk:

Contributor: Need to handle all cases. This will miss aliases and global initializers.

Author: Sorry, I am not sure what you mean by handling global initializers.

Author: If by global initializers you mean global ctors, isLDSVariableToLower(GV) already skips cloning them as intended. I am handling GlobalAlias now in the latest revision of this patch.

Contributor: I mean you can initialize a global variable to contain a pointer value (e.g. in llvm.used). Instructions / Operators are not the only possible users:

    @llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @var1 to ptr)], section "llvm.metadata"
    @lds = addrspace(3) global i32 0
    @b = addrspace(1) global ptr addrspace(3) @lds

Author: So in your example, if @var1 is an LDS variable which is cloned, every global intrinsic containing @var1.clone.0 should be cloned. Right?

Contributor: For the special case of llvm.used, yes. I think we need to think harder about the general initializer case.
  }

  for (auto &GV : M.globals()) {
    if (AMDGPU::isLDSVariableToLower(GV) && !GVToFnMap.contains(&GV))
      GVToFnMap.insert({&GV, getFunctionDefiningGV(GV)});
  }
  return GVToFnMap;
}

PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &AM) {
  if (MaxCountForClonedFunctions.getValue() == 1)
    return PreservedAnalyses::all();

  bool Changed = false;
  auto &CG = AM.getResult<CallGraphAnalysis>(M);

  // For each function in the call graph, determine the number
  // of ancestor-caller kernels.
  DenseMap<Function *, unsigned int> KernelRefsToFuncs;
  for (auto &Fn : M) {
    if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;
    for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I) {
      if (auto *F = I->getFunction())
        KernelRefsToFuncs[F]++;
    }
  }

  GVToFnMapTy GVToFnMap = collectModuleGlobals(M);
  for (auto [GV, OldF] : GVToFnMap) {
    LLVM_DEBUG(dbgs() << "Found LDS " << GV->getName() << " used in function "
                      << OldF->getName() << '\n');

    // Collect all call instructions to OldF.
    SmallVector<Instruction *> InstsCallingOldF;
    for (auto &U : OldF->uses()) {
      if (auto *CI = dyn_cast<CallBase>(U.getUser()))
        InstsCallingOldF.push_back(CI);
    }

    // Create as many clones of the function containing the LDS global as
    // there are kernels calling the function (including the function
    // already defining the LDS global). Clone the LDS global and the call
    // instructions to the function accordingly.
    LLVM_DEBUG(dbgs() << "\tFunction is referenced by "
                      << KernelRefsToFuncs[OldF] << " kernels.\n");
    for (unsigned int ID = 0;
         ID + 1 < std::min(KernelRefsToFuncs[OldF],
                           MaxCountForClonedFunctions.getValue());
         ++ID) {
      // Clone the LDS global variable.
      auto *NewGV = new GlobalVariable(
          M, GV->getValueType(), GV->isConstant(), GlobalValue::InternalLinkage,
          PoisonValue::get(GV->getValueType()),
          GV->getName() + ".clone." + Twine(ID), GV,
          GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
      NewGV->copyAttributesFrom(GV);
      NewGV->copyMetadata(GV, 0);
      NewGV->setComdat(GV->getComdat());
      LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
                        << '\n');

      // Clone the function.
      ValueToValueMapTy VMap;
      VMap[GV] = NewGV;
      auto *NewF = CloneFunction(OldF, VMap);
      NewF->setName(OldF->getName() + ".clone." + Twine(ID));
      LLVM_DEBUG(dbgs() << "Inserting function clone with name "
                        << NewF->getName() << '\n');

      // Create a new call instruction to the cloned function.
      for (auto *Inst : InstsCallingOldF) {
        Instruction *I = Inst->clone();
        I->setName(Inst->getName() + ".clone." + Twine(ID));
        if (auto *CI = dyn_cast<CallBase>(I))
          CI->setCalledOperand(NewF);
        I->insertAfter(Inst);
        LLVM_DEBUG(dbgs() << "Inserting inst: " << *I << '\n');
      }
      Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass())
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -725,6 +725,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
PM.addPass(AMDGPUCloneModuleLDSPass());
if (EnableLowerModuleLDS)
PM.addPass(AMDGPULowerModuleLDSPass(*this));
});
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp
AMDGPUCallLowering.cpp
AMDGPUCloneModuleLDS.cpp
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
@@ -0,0 +1,120 @@
; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s

; RUN: opt -passes=amdgpu-clone-module-lds %s -S -o %t
; RUN: llvm-split -o %t %t -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=MOD0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=MOD1 %s

target triple = "amdgcn-amd-amdhsa"

; Before transformation, After transformation,
; K1 K2 K1 K2
; | / | /
; | / | /
; A ==> A
; | \ | \
; | \ | \
; B C B C
; | | \
; X X1 X2
;
; where X contains an LDS reference

; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
@lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16

define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
; CHECK-SAME: i32 [[N:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
; CHECK-NEXT: ret void
;
entry:
%call = call i32 @A(i32 %n)
ret void
}

define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
; CHECK-SAME: i32 [[N:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CALL:%.*]] = call i32 @A(i32 [[N]])
; CHECK-NEXT: ret void
;
entry:
%call = call i32 @A(i32 %n)
ret void
}

define void @A() {
; CHECK-LABEL: define void @A() {
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @B()
; CHECK-NEXT: call void @C()
; CHECK-NEXT: ret void
;
entry:
call void @B()
call void @C()
ret void
}

define i32 @B() {
; CHECK-LABEL: define i32 @B() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 5, ptr [[P]], align 4
; CHECK-NEXT: [[RET:%.*]] = call i32 @X(ptr [[P]])
; CHECK-NEXT: [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
; CHECK-NEXT: ret i32 [[RET]]
;
entry:
%p = alloca i32
store i32 5, ptr %p
%ret = call i32 @X(ptr %p)
ret i32 %ret
}

define void @C() {
; CHECK-LABEL: define void @C() {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}

define i32 @X(ptr %x) {
; CHECK-LABEL: define i32 @X(
; CHECK-SAME: ptr [[X:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[X]], align 4
; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4
; CHECK-NEXT: ret i32 [[V]]
;
entry:
%p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
%v = load i32, ptr %x
store i32 %v, ptr %p
ret i32 %v
}

; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0
; CHECK-NEXT: %v = load i32, ptr %x, align 4
; CHECK-NEXT: store i32 %v, ptr %p, align 4
; CHECK-NEXT: ret i32 %v

; MOD0: {{.*}} addrspace(3) global [64 x i32] undef, align 16
; MOD0: define i32 @X(ptr %x)

; MOD1: {{.*}} addrspace(3) global [64 x i32] poison, align 16
; MOD1: define protected amdgpu_kernel void @kernel1(i32 %n)
; MOD1: define protected amdgpu_kernel void @kernel2(i32 %n)
; MOD1: define void @A()
; MOD1: define i32 @B()
; MOD1: define i32 @X.clone.0(ptr %x)
Loading