diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp index 1f70c82ac..db5d6bd76 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIECombineStridedOps.cpp @@ -15,6 +15,7 @@ #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -159,7 +160,7 @@ void AMDAIECombineStridedOpsPass::runOnOperation() { Operation *parentOp = getOperation(); MLIRContext *context = &getContext(); RewritePatternSet patterns(context); - patterns.insert(context); + populateStridedOpCombinationPattern(patterns); if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { parentOp->emitOpError("failed to combine strided operations"); return signalPassFailure(); @@ -168,6 +169,10 @@ void AMDAIECombineStridedOpsPass::runOnOperation() { } // namespace +void populateStridedOpCombinationPattern(RewritePatternSet &patterns) { + patterns.insert(patterns.getContext()); +} + std::unique_ptr createAMDAIECombineStridedOpsPass() { return std::make_unique(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp new file mode 100644 index 000000000..3991e4731 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaComposition.cpp @@ -0,0 +1,79 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file composes more complex strided DMA ops by iteratively: +// 1. Combining ops in the same block. +// 2. Subsuming loop iterations into the strided access pattern. +// +//===----------------------------------------------------------------------===// + +#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-dma-composition" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +class AMDAIEDmaCompositionPass + : public impl::AMDAIEDmaCompositionBase { + public: + AMDAIEDmaCompositionPass() = default; + AMDAIEDmaCompositionPass(const AMDAIEDmaCompositionPass &pass){}; + AMDAIEDmaCompositionPass(const AMDAIEDmaCompositionOptions &options) + : AMDAIEDmaCompositionBase(options) {} + void runOnOperation() override; +}; + +void AMDAIEDmaCompositionPass::runOnOperation() { + Operation *parentOp = getOperation(); + MLIRContext *context = &getContext(); + RewritePatternSet patterns(context); + { + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. 
This " + "device-specific information is required to determine when loops " + "can be subsumed into DMA operations, and must be attached to a " + "containing ModuleOp."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), + onlyZeroStrideOnOuterDim); + } + populateStridedOpCombinationPattern(patterns); + if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { + parentOp->emitOpError("failed to compose strided operations"); + return signalPassFailure(); + } + + IRRewriter rewriter(parentOp->getContext()); + if (failed(moveNpuDmaSyncUsersAfterAncestorInSameBlock(rewriter, parentOp))) { + parentOp->emitOpError() << "failed to move DMA users to correct scope " + "after strided op composition"; + return signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr createAMDAIEDmaCompositionPass( + AMDAIEDmaCompositionOptions options) { + return std::make_unique(options); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index 112a6b40e..e3ecfef90 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -25,6 +25,7 @@ #include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/AMDAIEUtils.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -49,15 +50,6 @@ int64_t calculateNbIterations(int64_t lowerBound, int64_t upperBound, namespace { -/// Return an ancestor of 'op' in 'block', or nullptr 
if no such ancestor. -Operation *getAncestorInBlock(Operation *op, Block *block) { - if (!op || !block) return nullptr; - auto parent = op; - while (parent && (parent->getBlock() != block)) - parent = parent->getParentOp(); - return parent; -} - /// Utility affine expression visitor to retrieve the scale and optional bias /// from the expression. struct RetrieveScaleAndBias @@ -112,31 +104,6 @@ struct RetrieveScaleAndBias } }; -/// Utility to clean up the DMA users after loop subsumption + hoisting. This -/// will hoist `amdaie.npu.dma_cpy_nd`'s users like `npu.dma_wait` as well. -LogicalResult moveUsersToHoistedDMAScope(Operation *parentOp) { - IRRewriter rewriter(parentOp->getContext()); - // Move `amdaie.npu.dma_wait` operation after the parent op in the same block - // as the input `amdaie.npu.dma_cpy_nd` operation. This parent op will - // typically be a loop out of which the DMA operation has been hoisted. Moving - // the wait operation after this loop is important to avoid a deadlock with - // whatever operations are still remaining inside the loop's scope. 
- WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) { - Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp(); - Operation *ancestorInSameBlock = - getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock()); - if (!ancestorInSameBlock) { - npuDmaWaitOp->emitOpError( - "doesn't have an ancestor in the same scope as the source DMA op"); - return WalkResult::interrupt(); - } - rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - struct SubsumeLoopIntoDMA : public OpInterfaceRewritePattern { using OpInterfaceRewritePattern::OpInterfaceRewritePattern; @@ -594,7 +561,7 @@ class AMDAIEDmaLoopSubsumptionPass } AMDAIEDmaLoopSubsumptionPass() = default; - AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass) {}; + AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionPass &pass){}; AMDAIEDmaLoopSubsumptionPass(const AMDAIEDmaLoopSubsumptionOptions &options) : AMDAIEDmaLoopSubsumptionBase(options) {} void runOnOperation() override; @@ -605,7 +572,6 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() { MLIRContext *context = &getContext(); RewritePatternSet patterns(context); - { auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); @@ -619,11 +585,8 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() { } AMDAIE::AMDAIEDeviceModel deviceModel = AMDAIE::getDeviceModel(maybeDevice.value()); - - SubsumeLoopIntoDMA pattern(context, std::move(deviceModel), - onlyZeroStrideOnOuterDim); - - patterns.insert(std::move(pattern)); + populateDmaLoopSubsumptionPattern(patterns, std::move(deviceModel), + onlyZeroStrideOnOuterDim); } if (failed(applyPatternsAndFoldGreedily(parentOp, std::move(patterns)))) { @@ -631,7 +594,8 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() { return signalPassFailure(); } - if (failed(moveUsersToHoistedDMAScope(parentOp))) 
{ + IRRewriter rewriter(parentOp->getContext()); + if (failed(moveNpuDmaSyncUsersAfterAncestorInSameBlock(rewriter, parentOp))) { parentOp->emitOpError( "failed to move DMA users to correct scope after loop subsumption"); return signalPassFailure(); @@ -640,6 +604,14 @@ void AMDAIEDmaLoopSubsumptionPass::runOnOperation() { } // namespace +void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns, + AMDAIE::AMDAIEDeviceModel &&deviceModel, + bool onlyZeroStrideOnOuterDim) { + SubsumeLoopIntoDMA pattern(patterns.getContext(), std::move(deviceModel), + onlyZeroStrideOnOuterDim); + patterns.insert(std::move(pattern)); +} + std::unique_ptr createAMDAIEDmaLoopSubsumptionPass( AMDAIEDmaLoopSubsumptionOptions options) { return std::make_unique(options); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp index b66ac71fa..1ff0b3eec 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp @@ -12,6 +12,15 @@ namespace mlir::iree_compiler::AMDAIE { +/// Return an ancestor of 'op' in 'block', or nullptr if no such ancestor. +Operation *getAncestorInBlock(Operation *op, Block *block) { + if (!op || !block) return nullptr; + auto parent = op; + while (parent && (parent->getBlock() != block)) + parent = parent->getParentOp(); + return parent; +} + /// Utility to retrieve a constant index from an OpFoldResult. 
int64_t getConstantIndexOrAssert(OpFoldResult dim) { std::optional size = getConstantIntValue(dim); @@ -317,4 +326,22 @@ LogicalResult foldUnitDims(const SmallVector &offsets, return success(foldableUnitDimsFound); } +LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( + RewriterBase &rewriter, Operation *parentOp) { + WalkResult res = parentOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) { + Operation *dmaOp = npuDmaWaitOp.getDma().getDefiningOp(); + Operation *ancestorInSameBlock = + getAncestorInBlock(npuDmaWaitOp, dmaOp->getBlock()); + if (!ancestorInSameBlock) { + npuDmaWaitOp->emitOpError( + "doesn't have an ancestor in the same scope as the source DMA op"); + return WalkResult::interrupt(); + } + rewriter.moveOpAfter(npuDmaWaitOp, ancestorInSameBlock); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h index fa55f74d4..e628cc739 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.h @@ -9,6 +9,7 @@ #include "iree-amd-aie/IR/AMDAIEAttrs.h" #include "iree-amd-aie/IR/AMDAIEDmaOpInterface.h" +#include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "llvm/ADT/SmallVector.h" #include "mlir/IR/MLIRContext.h" @@ -301,6 +302,15 @@ struct DmaDimConfig { } }; +/// Utility to move each synchronization user (`amdaie.npu.dma_wait`) directly +/// after its ancestor in the same block as the DMA operation it's synchronizing +/// on. This utility can be used for cleanup after DMA transformations to avoid +/// deadlocks and/or ensure SSA dominance. 
The idea is to ensure correct +/// synchronization by not influencing whatever is happening in between the +/// async DMA operation and its synchronization op. +LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( + RewriterBase &rewriter, Operation *parentOp); + } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 05338ea55..d2f21f7d5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -60,6 +60,7 @@ iree_cc_library( "AMDAIECreateLogicalObjectFifoLink.cpp" "AMDAIECreateReferenceToAllocation.cpp" "AMDAIEDistributeCoresAndObjectFifos.cpp" + "AMDAIEDmaComposition.cpp" "AMDAIEDmaLoopSubsumption.cpp" "AMDAIEDmaToCircularDma.cpp" "AMDAIEDmaUtils.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index abc75e0f4..9ef92c268 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -39,6 +39,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIECREATEREFERENCETOALLOCATION #define GEN_PASS_DEF_AMDAIEDECOMPOSELINALGEXTPACKUNPACKTOAIR #define GEN_PASS_DEF_AMDAIEDISTRIBUTECORESANDOBJECTFIFOS +#define GEN_PASS_DEF_AMDAIEDMACOMPOSITION #define GEN_PASS_DEF_AMDAIEDMALOOPSUBSUMPTION #define GEN_PASS_DEF_AMDAIEDMATOCIRCULARDMA #define GEN_PASS_DEF_AMDAIEFLATTENLOGICALOBJECTFIFO diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index f1ac91a26..90b1bcfd8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -608,11 +608,7 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createAMDAIEDmaLoopSubsumptionPass()); - passManager.addPass(createCSEPass()); - passManager.addPass(createCanonicalizerPass()); - - passManager.addPass(createAMDAIECombineStridedOpsPass()); + passManager.addPass(createAMDAIEDmaCompositionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 59a875330..0a901bd5b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -122,6 +122,12 @@ std::unique_ptr createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass(); /// operations and distribute the logical objectFifos. std::unique_ptr createAMDAIEDistributeCoresAndObjectFifosPass(); +/// Create a pass to compose more complex DMA operations, e.g. by combining DMA +/// operations and/or subsuming loop iterations into the strided access +/// patterns. +std::unique_ptr createAMDAIEDmaCompositionPass( + AMDAIEDmaCompositionOptions options = {}); + /// Create a pass to subsume loop iterations into DMA operations' access /// patterns. 
std::unique_ptr createAMDAIEDmaLoopSubsumptionPass( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index c1ec4c15c..3da414445 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -173,6 +173,17 @@ def AMDAIEDistributeCoresAndObjectFifos : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDistributeCoresAndObjectFifosPass()"; } +def AMDAIEDmaComposition : + Pass<"iree-amdaie-dma-composition"> { + let summary = "Compose DMA operations by DMA combination and loop subsumption."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEDmaCompositionPass()"; + let options = [ + Option<"onlyZeroStrideOnOuterDim", "only-zero-stride-on-outer-dim", "bool", /*default=*/"true", + "Whether a stride of zero indicating a repeat is only supported on the " + "outer dimension. This is the case of AIE2(+)."> + ]; +} + def AMDAIEDmaLoopSubsumption : Pass<"iree-amdaie-dma-loop-subsumption"> { let summary = "Subsume loop iterations into DMA operations' access patterns."; @@ -459,7 +470,6 @@ def AMDAIESinkIntoCore : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESinkIntoCorePass()"; } - def AMDAIETile : InterfacePass<"iree-amdaie-tile", "mlir::FunctionOpInterface"> { let summary = "Pass to tile TilingInterface operations."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 41c5a9d20..0b805d8d6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -8,6 +8,7 @@ #define IREE_AMD_AIE_TRANSFORMS_AMDAIETRANSFORMS_H_ #include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" 
#include "mlir/Dialect/SCF/IR/SCF.h" @@ -38,6 +39,14 @@ LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForOp forOp); LogicalResult normalizeLoopBounds(RewriterBase &rewriter, scf::ForallOp forallOp); +/// Populate patterns that subsume loop iterations into DMA access patterns. +void populateDmaLoopSubsumptionPattern(RewritePatternSet &patterns, + AMDAIE::AMDAIEDeviceModel &&deviceModel, + bool onlyZeroStrideOnOuterDim); + +/// Populate patterns that combine strided ops in the same block. +void populateStridedOpCombinationPattern(RewritePatternSet &patterns); + } // namespace mlir::iree_compiler::AMDAIE #endif diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index affb368c6..319166c69 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -27,6 +27,7 @@ iree_lit_test_suite( "create_reference_to_allocation.mlir" "disable_vectorization.mlir" "distribute_cores_and_objectfifos.mlir" + "dma_composition.mlir" "dma_loop_subsumption.mlir" "dma_to_circular_dma.mlir" "flatten_logical_objectfifo.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_composition.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_composition.mlir new file mode 100644 index 000000000..36a0d8b1e --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_composition.mlir @@ -0,0 +1,145 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-dma-composition,canonicalize))" --split-input-file --verify-diagnostics %s | FileCheck %s + +module { + // expected-error @+1 {{has no AMDAIEDevice in the target attribute configuration}} + func.func @no_amdaie_device(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = 
amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %1 = amdaie.npu.dma_cpy_nd %0([] [] [], [] [] []) + amdaie.end + } + } + return + } +} + +// ----- + +//===----------------------------------------------------------------------===// +// Sanity checks for cases where no modification should happen. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @diff_circular_dmas +// CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([] [] [], [0] [16] [1]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([] [] [], [32] [16] [1]) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @diff_circular_dmas(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.circular_dma_cpy_nd(%arg0[0] [32] [1], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0] [16] [1]) + %3 = amdaie.npu.dma_cpy_nd %1([] [] [], [32] [16] [1]) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-LABEL: @no_combination_or_subsumption +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: scf.for %[[ARG2:.+]] = +// CHECK: %[[APPLY:.+]] = affine.apply #[[$MAP]](%[[ARG2]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[APPLY]], 0] [8, 16, 8, 16] [8, 32, 8, 1]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 
%[[APPLY]], 32] [8, 16, 8, 16] [8, 32, 8, 1]) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_combination_or_subsumption(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1, 0] [8, 16, 8, 16] [8, 32, 8, 1]) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, %1, 32] [8, 16, 8, 16] [8, 32, 8, 1]) + } + amdaie.end + } + } + return + } +} + +// ----- + +//===----------------------------------------------------------------------===// +// Checks in which composition should happen. 
+//===----------------------------------------------------------------------===// + +// CHECK-NOT: affine_map +// CHECK-LABEL: @combination_and_subsumption +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [6, 2, 8, 16] [128, 32, 8, 1]) +// CHECK-NOT: amdaie.npu.dma_cpy_nd +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @combination_and_subsumption(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [%1, 0] [8, 16] [8, 1]) + amdaie.npu.dma_wait(%2, MM2S) + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [%1, 32] [8, 16] [8, 1]) + amdaie.npu.dma_wait(%3, MM2S) + } + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-NOT: affine_map +// CHECK-LABEL: @subsumption_and_combination +// CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK-NOT: scf.for +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [2, 6, 8, 16] [32, 32, 8, 1]) +// CHECK-NOT: amdaie.npu.dma_cpy_nd +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) +#map = affine_map<(d0) -> (d0 * 32)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = 
"none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @subsumption_and_combination(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + scf.for %arg2 = %c0 to %c6 step %c1 { + %1 = affine.apply #map(%arg2) + %2 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, %1] [8, 16] [8, 1]) + amdaie.npu.dma_wait(%2, MM2S) + } + %3 = amdaie.npu.dma_cpy_nd %0([] [] [], [0, 0, 32] [6, 8, 16] [32, 8, 1]) + amdaie.npu.dma_wait(%3, MM2S) + amdaie.end + } + } + return + } +}