[AMD][Atomic] Introduce runtime LDS reduction algorithm for atomicRmwOp

Algorithm description: 1. Sort {ptr, operand} among the threads within the warp via bitonic sort based on DPP and Permute operations; 2. Distribute threads between groups defined by pointers, defining a group for each thread by analyzing neighbours with DPP instructions; 3. Select a master thread for each group using the exec mask; 4. Collect a partial sum in LDS for each group via DS_ADD-like instructions; 5. Perform a global atomic operation for each group from its master thread. Since the described algorithm requires additional memory, the size calculation should be done in the target-dependent code. For this purpose the `SetSpecificAllocationSize` pass was introduced; it sets the `allocation.size` attribute on the operations that require it. This attribute has the highest priority during LDS size calculation. Signed-off-by: Ilya Veselov <iveselov.nn@gmail.com>
triton-lang · Dec 26, 2024 · 28d0ea1 · 28d0ea1
1 parent 134b3eb
commit 28d0ea1
Show file tree

Hide file tree

Showing 10 changed files with 414 additions and 23 deletions.
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
@@ -117,6 +117,9 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 }
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
+  if (op->hasAttr("allocation.size")) {
+    return op->getAttrOfType<mlir::IntegerAttr>("allocation.size").getInt();
+  }
   if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
     ReduceOpHelper helper(reduceOp);
     return helper.getScratchSizeInBytes();

diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
@@ -267,6 +267,7 @@ def make_llir(src, metadata, options):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
         amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
+        amd.passes.ttgpuir.add_set_specific_allocation_size(pm, options.arch)
         # custom_lds_size is an experimental parameter that defines amount of LDS available
         # for one thread block. Measured in bytes.
         #

diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h
@@ -26,6 +26,9 @@ namespace mlir::triton::AMD {
 std::unique_ptr<OperationPass<ModuleOp>>
 createDecomposeUnsupportedConversionsPass(StringRef targetArch);
 
+std::unique_ptr<OperationPass<ModuleOp>>
+createSetSpecificAllocationSizePass(StringRef targetArch);
+
 /// @brief Creates pass that keep LDS consumption within specified limits.
 /// @param arch target architecture name, for example "gfx940"
 /// @param customLDSLimit defines LDS size available for one thread block

diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td
@@ -13,6 +13,16 @@ def DecomposeUnsupportedAMDConversions : Pass<"decompose-unsupported-amd-convers
     ];
 }
 
+def SetSpecificAllocationSize : Pass<"set-specific-allocation-size", "mlir::ModuleOp"> {
+    let summary = "Set `allocation.size` attribute to make a hint to allocator that operation requires LDS.";
+    let constructor = "mlir::triton::AMD::createSetSpecificAllocationSizePass(\"\")";
+
+    let options = [
+        Option<"arch", "arch", "std::string", /*default*/"\"\"",
+               "gfx target device architecture, e.g., gfx942">,
+    ];
+}
+
 def OptimizeAMDLDSUsage : Pass<"optimize-amd-lds-usage", "mlir::ModuleOp"> {
     let summary = "Minimize LDS usage";
     let constructor = "mlir::triton::AMD::createOptimizeLDSUsagePass(\"\")";

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt b/third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt
@@ -16,6 +16,7 @@ add_triton_library(TritonAMDGPUToLLVM
     TargetInfo.cpp
     TargetUtils.cpp
     DecomposeUnsupportedConversions.cpp
+    SetSpecificAllocationSizePass.cpp
     OptimizeLDSUsage.cpp
     OptimizeLDSUtility.cpp
     SPMDOpToLLVM.cpp