Skip to content

Commit

Permalink
Add DMA composition pass (#729)
Browse files Browse the repository at this point in the history
Add a pass that iteratively calls the `DmaLoopSubsumption` and
`CombineStridedOps` pattern rewriters as both can enable new composition
opportunities for each other.
  • Loading branch information
jtuyls authored Aug 30, 2024
1 parent cc68d51 commit 37bb7f1
Show file tree
Hide file tree
Showing 13 changed files with 311 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/Transforms/Transforms.h"
#include "llvm/ADT/STLExtras.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

Expand Down Expand Up @@ -159,7 +160,7 @@ void AMDAIECombineStridedOpsPass::runOnOperation() {
Operation *parentOp = getOperation();
MLIRContext *context = &getContext();
RewritePatternSet patterns(context);
patterns.insert<CombineStridedOps>(context);
populateStridedOpCombinationPattern(patterns);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError("failed to combine strided operations");
return signalPassFailure();
Expand All @@ -168,6 +169,10 @@ void AMDAIECombineStridedOpsPass::runOnOperation() {

} // namespace

/// Registers the `CombineStridedOps` rewrite pattern with `patterns`. The
/// pattern is constructed with the MLIR context owned by the pattern set.
void populateStridedOpCombinationPattern(RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();
  patterns.insert<CombineStridedOps>(ctx);
}

/// Factory returning a new, default-constructed
/// `AMDAIECombineStridedOpsPass` instance.
std::unique_ptr<Pass> createAMDAIECombineStridedOpsPass() {
  auto pass = std::make_unique<AMDAIECombineStridedOpsPass>();
  return pass;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file composes more complex strided DMA ops by iteratively:
// 1. Combining ops in the same block.
// 2. Subsuming loop iterations into the strided access pattern.
//
//===----------------------------------------------------------------------===//

#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/Transforms/Transforms.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-amdaie-dma-composition"

namespace mlir::iree_compiler::AMDAIE {

namespace {

/// Pass that composes strided DMA operations by applying the DMA loop
/// subsumption and strided-op combination patterns together (see
/// `runOnOperation`).
class AMDAIEDmaCompositionPass
    : public impl::AMDAIEDmaCompositionBase<AMDAIEDmaCompositionPass> {
 public:
  /// Creates the pass with default option values.
  AMDAIEDmaCompositionPass() = default;

  /// User-provided copy constructor with an intentionally empty body: the
  /// base class is default-constructed instead of being copied from the
  /// argument.
  AMDAIEDmaCompositionPass(const AMDAIEDmaCompositionPass &) {}

  /// Creates the pass, forwarding `options` to the generated base class.
  AMDAIEDmaCompositionPass(const AMDAIEDmaCompositionOptions &options)
      : AMDAIEDmaCompositionBase(options) {}

  void runOnOperation() override;
};

void AMDAIEDmaCompositionPass::runOnOperation() {
Operation *parentOp = getOperation();
MLIRContext *context = &getContext();
RewritePatternSet patterns(context);
{
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
std::optional<AMDAIEDevice> maybeDevice = getConfigAMDAIEDevice(targetAttr);
if (!maybeDevice) {
parentOp->emitOpError()
<< "has no AMDAIEDevice in the target attribute configuration. This "
"device-specific information is required to determine when loops "
"can be subsumed into DMA operations, and must be attached to a "
"containing ModuleOp.";
return signalPassFailure();
}
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());
populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel),
onlyZeroStrideOnOuterDim);
}
populateStridedOpCombinationPattern(patterns);
if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError("failed to compose strided operations");
return signalPassFailure();
}

IRRewriter rewriter(parentOp->getContext());
if (failed(moveNpuDmaSyncUsersAfterAncestorInSameBlock(rewriter, parentOp))) {
parentOp->emitOpError() << "failed to move DMA users to correct scope "
"after strided op composition";
return signalPassFailure();
}
}

} // namespace

/// Factory returning a new `AMDAIEDmaCompositionPass` configured with
/// `options`.
std::unique_ptr<Pass> createAMDAIEDmaCompositionPass(
    AMDAIEDmaCompositionOptions options) {
  auto pass = std::make_unique<AMDAIEDmaCompositionPass>(options);
  return pass;
}

} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"
#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/Transforms/Transforms.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
Expand All @@ -49,15 +50,6 @@ int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound,

namespace {

/// Walks from 'op' up through its parent operations and returns the first
/// operation (possibly 'op' itself) that is located directly in 'block'.
/// Returns nullptr if either argument is null or no such operation exists.
Operation *getAncestorInBlock(Operation *op, Block *block) {
  if (op == nullptr || block == nullptr) return nullptr;
  for (Operation *current = op; current != nullptr;
       current = current->getParentOp()) {
    if (current->getBlock() == block) return current;
  }
  return nullptr;
}

/// Utility affine expression visitor to retrieve the scale and optional bias
/// from the expression.
struct RetrieveScaleAndBias
Expand Down Expand Up @@ -112,31 +104,6 @@ struct RetrieveScaleAndBias
}
};

/// Utility to clean up the DMA users after loop subsumption + hoisting. This
/// will hoist `amdaie.npu.dma_cpy_nd`'s users like `npu.dma_wait` as well.
///
/// Walks all `AMDAIE::NpuDmaWaitOp`s nested under `parentOp`. Returns failure
/// (after emitting an error on the offending wait op) if the waited-on DMA
/// value is not produced by an operation, or if the wait op has no ancestor in
/// the block of the DMA operation.
LogicalResult moveUsersToHoistedDMAScope(Operation *parentOp) {
  IRRewriter rewriter(parentOp->getContext());
  // Move `amdaie.npu.dma_wait` operation after the parent op in the same block
  // as the input `amdaie.npu.dma_cpy_nd` operation. This parent op will
  // typically be a loop out of which the DMA operation has been hoisted. Moving
  // the wait operation after this loop is important to avoid a deadlock with
  // whatever operations are still remaining inside the loop's scope.
  WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
    Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp();
    // `getDefiningOp` returns null for values that are not op results (e.g.
    // block arguments); bail out with a diagnostic instead of dereferencing.
    if (!dmaOp) {
      npuDmaWaitOp->emitOpError(
          "expected the DMA operand to be the result of an operation");
      return WalkResult::interrupt();
    }
    Operation *ancestorInSameBlock =
        getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock());
    if (!ancestorInSameBlock) {
      npuDmaWaitOp->emitOpError(
          "doesn't have an ancestor in the same scope as the source DMA op");
      return WalkResult::interrupt();
    }
    rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock);
    return WalkResult::advance();
  });
  if (res.wasInterrupted()) return failure();
  return success();
}

struct SubsumeLoopIntoDMA
: public OpInterfaceRewritePattern<AMDAIE::DoublyStridedOpInterface> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
Expand Down Expand Up @@ -594,7 +561,7 @@ class AMDAIEDmaLoopSubsumptionPass
}

AMDAIEDmaLoopSubsumptionPass() = default;
AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass) {};
AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass){};
AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionOptions &options)
: AMDAIEDmaLoopSubsumptionBase(options) {}
void runOnOperation() override;
Expand All @@ -605,7 +572,6 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {
MLIRContext *context = &getContext();

RewritePatternSet patterns(context);

{
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp);
std::optional<AMDAIEDevice> maybeDevice = getConfigAMDAIEDevice(targetAttr);
Expand All @@ -619,19 +585,17 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {
}
AMDAIE::AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(maybeDevice.value());

SubsumeLoopIntoDMA pattern(context, std::move(deviceModel),
onlyZeroStrideOnOuterDim);

patterns.insert<SubsumeLoopIntoDMA>(std::move(pattern));
populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel),
onlyZeroStrideOnOuterDim);
}

if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) {
parentOp->emitOpError("failed to subsume some loops into DMA operations");
return signalPassFailure();
}

if (failed(moveUsersToHoistedDMAScope(parentOp))) {
IRRewriter rewriter(parentOp->getContext());
if (failed(moveNpuDmaSyncUsersAfterAncestorInSameBlock(rewriter, parentOp))) {
parentOp->emitOpError(
"failed to move DMA users to correct scope after loop subsumption");
return signalPassFailure();
Expand All @@ -640,6 +604,14 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() {

} // namespace

/// Adds the `SubsumeLoopIntoDMA` rewrite pattern to `patterns`. The pattern is
/// built from the pattern set's context, takes ownership of `deviceModel` and
/// is configured with `onlyZeroStrideOnOuterDim`.
void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns,
                                       AMDAIE::AMDAIEDeviceModel &&deviceModel,
                                       bool onlyZeroStrideOnOuterDim) {
  MLIRContext *ctx = patterns.getContext();
  SubsumeLoopIntoDMA loopSubsumption(ctx, std::move(deviceModel),
                                     onlyZeroStrideOnOuterDim);
  patterns.insert<SubsumeLoopIntoDMA>(std::move(loopSubsumption));
}

std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass(
AMDAIEDmaLoopSubsumptionOptions options) {
return std::make_unique<AMDAIEDmaLoopSubsumptionPass>(options);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@

namespace mlir::iree_compiler::AMDAIE {

/// Walks from 'op' up through its parent operations and returns the first
/// operation (possibly 'op' itself) that is located directly in 'block'.
/// Returns nullptr if either argument is null or no such operation exists.
Operation *getAncestorInBlock(Operation *op, Block *block) {
  if (op == nullptr || block == nullptr) return nullptr;
  for (Operation *current = op; current != nullptr;
       current = current->getParentOp()) {
    if (current->getBlock() == block) return current;
  }
  return nullptr;
}

/// Utility to retrieve a constant index from an OpFoldResult.
int64_t getConstantIndexOrAssert(OpFoldResult dim) {
std::optional<int64_t> size = getConstantIntValue(dim);
Expand Down Expand Up @@ -317,4 +326,22 @@ LogicalResult foldUnitDims(const SmallVector<OpFoldResult> &offsets,
return success(foldableUnitDimsFound);
}

/// Moves every `AMDAIE::NpuDmaWaitOp` nested under `parentOp` directly after
/// its ancestor that lives in the same block as the operation defining the
/// DMA value it waits on. All moves are performed through `rewriter`.
///
/// Returns failure (after emitting an error on the offending wait op) if the
/// waited-on DMA value is not produced by an operation, or if the wait op has
/// no ancestor in the block of the DMA operation.
LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
    RewriterBase &rewriter, Operation *parentOp) {
  WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
    Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp();
    // `getDefiningOp` returns null for values that are not op results (e.g.
    // block arguments); bail out with a diagnostic instead of dereferencing.
    if (!dmaOp) {
      npuDmaWaitOp->emitOpError(
          "expected the DMA operand to be the result of an operation");
      return WalkResult::interrupt();
    }
    Operation *ancestorInSameBlock =
        getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock());
    if (!ancestorInSameBlock) {
      npuDmaWaitOp->emitOpError(
          "doesn't have an ancestor in the same scope as the source DMA op");
      return WalkResult::interrupt();
    }
    rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock);
    return WalkResult::advance();
  });
  if (res.wasInterrupted()) return failure();
  return success();
}

} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "iree-amd-aie/IR/AMDAIEAttrs.h"
#include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h"
#include "iree-amd-aie/IR/AMDAIEOps.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/MLIRContext.h"
Expand Down Expand Up @@ -301,6 +302,15 @@ struct DmaDimConfig {
}
};

/// Utility to move each synchronization user (`amdaie.npu.dma_wait`) directly
/// after its ancestor in the same block as the DMA operation it's synchronizing
/// on. This utility can be used for cleanup after DMA transformations to avoid
/// deadlocks and/or ensure SSA dominance. The idea is to ensure correct
/// synchronization by not influencing whatever is happening in between the
/// async DMA operation and its synchronization op.
///
/// All `amdaie.npu.dma_wait` ops nested under `parentOp` are visited and moved
/// through `rewriter`. Returns failure if a wait op has no ancestor in the
/// block of its DMA operation.
LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
RewriterBase &rewriter, Operation *parentOp);

} // namespace mlir::iree_compiler::AMDAIE

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ iree_cc_library(
"AMDAIECreateLogicalObjectFifoLink.cpp"
"AMDAIECreateReferenceToAllocation.cpp"
"AMDAIEDistributeCoresAndObjectFifos.cpp"
"AMDAIEDmaComposition.cpp"
"AMDAIEDmaLoopSubsumption.cpp"
"AMDAIEDmaToCircularDma.cpp"
"AMDAIEDmaUtils.cpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ namespace mlir::iree_compiler::AMDAIE {
#define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION
#define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR
#define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS
#define GEN_PASS_DEF_AMDAIEDMACOMPOSITION
#define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION
#define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA
#define GEN_PASS_DEF_AMDAIEFLATTENLOGICALOBJECTFIFO
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -608,11 +608,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) {
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIEDmaLoopSubsumptionPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());

passManager.addPass(createAMDAIECombineStridedOpsPass());
passManager.addPass(createAMDAIEDmaCompositionPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ std::unique_ptr<Pass> createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass();
/// operations and distribute the logical objectFifos.
std::unique_ptr<Pass> createAMDAIEDistributeCoresAndObjectFifosPass();

/// Create a pass to compose more complex DMA operations, e.g. by combining DMA
/// operations and/or subsuming loop iterations into the strided access
/// patterns. `options` is forwarded to the created pass; when omitted, a
/// value-initialized `AMDAIEDmaCompositionOptions` is used.
std::unique_ptr<Pass> createAMDAIEDmaCompositionPass(
AMDAIEDmaCompositionOptions options = {});

/// Create a pass to subsume loop iterations into DMA operations' access
/// patterns.
std::unique_ptr<Pass> createAMDAIEDmaLoopSubsumptionPass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,17 @@ def AMDAIEDistributeCoresAndObjectFifos :
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()";
}

// Declares the `iree-amdaie-dma-composition` pass, constructed through
// `createAMDAIEDmaCompositionPass()`. It exposes one boolean option,
// `only-zero-stride-on-outer-dim` (default: true).
def AMDAIEDmaComposition :
Pass<"iree-amdaie-dma-composition"> {
let summary = "Compose DMA operations by DMA combination and loop subsumption.";
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDmaCompositionPass()";
let options = [
Option<"onlyZeroStrideOnOuterDim", "only-zero-stride-on-outer-dim", "bool", /*default=*/"true",
"Whether a stride of zero indicating a repeat is only supported on the "
"outer dimension. This is the case of AIE2(+).">
];
}

def AMDAIEDmaLoopSubsumption :
Pass<"iree-amdaie-dma-loop-subsumption"> {
let summary = "Subsume loop iterations into DMA operations' access patterns.";
Expand Down Expand Up @@ -459,7 +470,6 @@ def AMDAIESinkIntoCore :
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()";
}


def AMDAIETile :
InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> {
let summary = "Pass to tile TilingInterface operations.";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define IREE_AMD_AIE_TRANSFORMS_AMDAIETRANSFORMS_H_

#include "iree-amd-aie/IR/AMDAIEOps.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

Expand Down Expand Up @@ -38,6 +39,14 @@ LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp);
LogicalResult normalizeLoopBounds(RewriterBase &rewriter,
scf::ForallOp forallOp);

/// Populate patterns that subsume loop iterations into DMA access patterns.
void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns,
AMDAIE::AMDAIEDeviceModel &&deviceModel,
bool onlyZeroStrideOnOuterDim);

/// Populate patterns that combine strided ops in the same block.
void populateStridedOpCombinationPattern(RewritePatternSet &patterns);

} // namespace mlir::iree_compiler::AMDAIE

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ iree_lit_test_suite(
"create_reference_to_allocation.mlir"
"disable_vectorization.mlir"
"distribute_cores_and_objectfifos.mlir"
"dma_composition.mlir"
"dma_loop_subsumption.mlir"
"dma_to_circular_dma.mlir"
"flatten_logical_objectfifo.mlir"
Expand Down
Loading

0 comments on commit 37bb7f1

Please sign in to comment.