[NFC][AMDGPU] Make AMDGPUSplitModule a ModulePass (#95773)

Pierre-vh · web-flow · commit d95b82c49aef · 2024-06-18T09:16:32.000+02:00
This allows the pass to access TTI correctly, and opens the door to
accessing more analyses in the future.

I went back and forth between this and also making the default
SplitModule a pass to keep things uniform, but I decided against it
because it would be a needless complication. Neither llvm-split nor
LTOBackend has a PM ready to use, so we need to create one anyway. Let's
keep all the mess hidden in the AMDGPU version for now to keep this
change more self-contained.
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
@@ -431,7 +431,7 @@ class TargetMachine {
   /// and \p M has not been modified.
   virtual bool splitModule(
       Module &M, unsigned NumParts,
-      function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
+      function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
     return false;
   }
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -98,6 +98,7 @@ static cl::opt<bool>
 
 using CostType = InstructionCost::CostType;
 using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
 
 static bool isEntryPoint(const Function *F) {
   return AMDGPU::isEntryFunctionCC(F->getCallingConv());
@@ -214,13 +215,12 @@ static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
 
 /// Calculate the cost of each function in \p M
 /// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param GetTTI Abstract getter for TargetTransformInfo.
 /// \param M Module to analyze.
 /// \param CostMap[out] Resulting Function -> Cost map.
 /// \return The module's total cost.
 static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
-                       Module &M,
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
                        DenseMap<const Function *, CostType> &CostMap) {
   CostType ModuleCost = 0;
   CostType KernelCost = 0;
@@ -230,8 +230,7 @@ calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
       continue;
 
     CostType FnCost = 0;
-    TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
+    const auto &TTI = GetTTI(Fn);
     for (const auto &BB : Fn) {
       for (const auto &I : BB) {
         auto Cost =
@@ -438,8 +437,9 @@ doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
   // assign X to a partition as usual, but when we get to Y, we check if it's
   // worth also putting it in Y's partition.
   const CostType LargeKernelThreshold =
-      LargeKernelFactor ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
-                        : std::numeric_limits<CostType>::max();
+      LargeKernelFactor
+          ? CostType(((ModuleCost / NumParts) * LargeKernelFactor))
+          : std::numeric_limits<CostType>::max();
 
   std::vector<DenseSet<const Function *>> Partitions;
   Partitions.resize(NumParts);
@@ -604,10 +604,9 @@ static void externalize(GlobalValue &GV) {
   if (!GV.hasName())
     GV.setName("__llvmsplit_unnamed");
 }
-} // end anonymous namespace
 
-void llvm::splitAMDGPUModule(
-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
+static void splitAMDGPUModule(
+    GetTTIFn GetTTI, Module &M, unsigned N,
     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
 
   SplitModuleLogger SML(M);
@@ -648,7 +647,7 @@ void llvm::splitAMDGPUModule(
   // Start by calculating the cost of every function in the module, as well as
   // the module's overall cost.
   DenseMap<const Function *, CostType> FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
+  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
 
   // Gather every kernel into a WorkList, then sort it by descending total cost
   // of the kernel so the biggest kernels are seen first.
@@ -742,3 +741,16 @@ void llvm::splitAMDGPUModule(
       << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
       << "% of original module)\n";
 }
+} // namespace
+
+PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
+                                             ModuleAnalysisManager &MAM) {
+  FunctionAnalysisManager &FAM =
+      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
+    return FAM.getResult<TargetIRAnalysis>(F);
+  };
+  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+  // We don't change the original module.
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
@@ -12,18 +12,27 @@
 #define LLVM_TARGET_AMDGPUSPLITMODULE_H
 
 #include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/IR/PassManager.h"
 #include <memory>
 
 namespace llvm {
 
-class Module;
-class AMDGPUTargetMachine;
-
 /// Splits the module M into N linkable partitions. The function ModuleCallback
 /// is called N times passing each individual partition as the MPart argument.
-void splitAMDGPUModule(
-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
+class AMDGPUSplitModulePass : public PassInfoMixin<AMDGPUSplitModulePass> {
+public:
+  using ModuleCreationCallback =
+      function_ref<void(std::unique_ptr<Module> MPart)>;
+
+  AMDGPUSplitModulePass(unsigned N, ModuleCreationCallback ModuleCallback)
+      : N(N), ModuleCallback(ModuleCallback) {}
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+  unsigned N;
+  ModuleCreationCallback ModuleCallback;
+};
 
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -829,8 +829,24 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
 
 bool AMDGPUTargetMachine::splitModule(
     Module &M, unsigned NumParts,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
-  splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
+    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
+  // but all current users of this API don't have one ready and would need to
+  // create one anyway. Let's hide the boilerplate for now to keep it simple.
+
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+
+  PassBuilder PB(this);
+  PB.registerModuleAnalyses(MAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  ModulePassManager MPM;
+  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
+  MPM.run(M, MAM);
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -76,7 +76,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
 
   bool splitModule(Module &M, unsigned NumParts,
                    function_ref<void(std::unique_ptr<Module> MPart)>
-                       ModuleCallback) const override;
+                       ModuleCallback) override;
 };
 
 //===----------------------------------------------------------------------===//

Original file line number	Diff line number	Diff line change
`@@ -431,7 +431,7 @@ class TargetMachine {`
`431`	`431`	`/// and \p M has not been modified.`
`432`	`432`	`virtual bool splitModule(`
`433`	`433`	`Module &M, unsigned NumParts,`
`434`		`- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {`
	`434`	`+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {`
`435`	`435`	`return false;`
`436`	`436`	`}`
`437`	`437`	`};`