Skip to content

Commit

Permalink
[AMD][Atomic] Introduce runtime LDS reduction algorithm for atomicRmwOp
Browse files Browse the repository at this point in the history
Algorithm description:
1. Sort {ptr, operand} among the threads within the warp
   via bitonic sort based on DPP and Permute operations;
2. Distribute threads between groups defined by pointers,
   define a group for each thread by analyzing neighbours
   with DPP instructions;
3. Select master thread for each group using exec mask;
4. Collect partial sum in LDS for each group via
   DS_ADD-like instructions;
5. Utilize global atomic operation for each group by
   master thread.

Since the described algorithm requires additional memory, the size calculation
should be done in the target-dependent code.
For this purpose, the `SetSpecificAllocationSize` pass was introduced; it sets
the `allocation.size` attribute on the operations that require it. This attribute
has the highest priority during LDS size calculation.

Signed-off-by: Ilya Veselov <iveselov.nn@gmail.com>
  • Loading branch information
joviliast committed Dec 26, 2024
1 parent 134b3eb commit 28d0ea1
Show file tree
Hide file tree
Showing 10 changed files with 414 additions and 23 deletions.
3 changes: 3 additions & 0 deletions lib/Analysis/Allocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
}

unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
if (op->hasAttr("allocation.size")) {
return op->getAttrOfType<mlir::IntegerAttr>("allocation.size").getInt();
}
if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
ReduceOpHelper helper(reduceOp);
return helper.getScratchSizeInBytes();
Expand Down
1 change: 1 addition & 0 deletions third_party/amd/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ def make_llir(src, metadata, options):
pm = ir.pass_manager(mod.context)
pm.enable_debug()
amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
amd.passes.ttgpuir.add_set_specific_allocation_size(pm, options.arch)
# custom_lds_size is an experimental parameter that defines amount of LDS available
# for one thread block. Measured in bytes.
#
Expand Down
3 changes: 3 additions & 0 deletions third_party/amd/include/TritonAMDGPUToLLVM/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ namespace mlir::triton::AMD {
/// @brief Creates the module-level pass for decomposing unsupported
/// conversions (behaviour inferred from the name only — TODO confirm against
/// the implementation).
/// @param targetArch target architecture name; presumably a gfx string such as
/// "gfx940" — confirm against callers.
std::unique_ptr<OperationPass<ModuleOp>>
createDecomposeUnsupportedConversionsPass(StringRef targetArch);

/// @brief Creates the module-level pass that sets the `allocation.size`
/// attribute as a hint to the allocator that an operation requires LDS.
/// @param targetArch gfx target device architecture, e.g. "gfx942"
std::unique_ptr<OperationPass<ModuleOp>>
createSetSpecificAllocationSizePass(StringRef targetArch);

/// @brief Creates pass that keep LDS consumption within specified limits.
/// @param arch target architecture name, for example "gfx940"
/// @param customLDSLimit defines LDS size available for one thread block
Expand Down
10 changes: 10 additions & 0 deletions third_party/amd/include/TritonAMDGPUToLLVM/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ def DecomposeUnsupportedAMDConversions : Pass<"decompose-unsupported-amd-convers
];
}

// Declares the `set-specific-allocation-size` pass, which operates on
// `mlir::ModuleOp`. The pass attaches an `allocation.size` attribute to
// operations as a hint to the allocator that they require LDS.
def SetSpecificAllocationSize : Pass<"set-specific-allocation-size", "mlir::ModuleOp"> {
let summary = "Set `allocation.size` attribute to make a hint to allocator that operation requires LDS.";
// The default constructor passes an empty architecture string.
let constructor = "mlir::triton::AMD::createSetSpecificAllocationSizePass(\"\")";

// `arch` selects the gfx target; it defaults to the empty string.
let options = [
Option<"arch", "arch", "std::string", /*default*/"\"\"",
"gfx target device architecture, e.g., gfx942">,
];
}

def OptimizeAMDLDSUsage : Pass<"optimize-amd-lds-usage", "mlir::ModuleOp"> {
let summary = "Minimize LDS usage";
let constructor = "mlir::triton::AMD::createOptimizeLDSUsagePass(\"\")";
Expand Down
1 change: 1 addition & 0 deletions third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ add_triton_library(TritonAMDGPUToLLVM
TargetInfo.cpp
TargetUtils.cpp
DecomposeUnsupportedConversions.cpp
SetSpecificAllocationSizePass.cpp
OptimizeLDSUsage.cpp
OptimizeLDSUtility.cpp
SPMDOpToLLVM.cpp
Expand Down
Loading

0 comments on commit 28d0ea1

Please sign in to comment.