Skip to content

Commit

Permalink
squashed commit
Browse files Browse the repository at this point in the history
  • Loading branch information
newling committed Aug 15, 2024
1 parent 8b9c73e commit 93b20b1
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 300 deletions.
77 changes: 71 additions & 6 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ LogicalResult ControlCodeOp::verify() {
// AMDAIE_CoreOp
//===----------------------------------------------------------------------===//


void CoreOp::build(OpBuilder &b, OperationState &result, AMDAIE::TileOp tileOp,
ValueRange inputDmas, ValueRange outputDmas) {
build(b, result, b.getIndexType(), tileOp, inputDmas, outputDmas, nullptr);
Expand Down Expand Up @@ -386,8 +385,7 @@ void LogicalObjectFifoFromMemrefOp::build(
for (auto [column, row] : tileLocations) {
auto getCol = b.create<arith::ConstantIndexOp>(b.getUnknownLoc(), column);
auto getRow = b.create<arith::ConstantIndexOp>(b.getUnknownLoc(), row);
auto tileOp =
b.create<AMDAIE::TileOp>(b.getUnknownLoc(), getCol, getRow);
auto tileOp = b.create<AMDAIE::TileOp>(b.getUnknownLoc(), getCol, getRow);
tiles.push_back(tileOp.getResult());
}
// For deterministic order.
Expand All @@ -397,6 +395,74 @@ void LogicalObjectFifoFromMemrefOp::build(
build(b, result, type, memref, tiles);
}

/// Canonicalization for NpuDmaCpyNdOp: any offset, size or stride entry that
/// is currently encoded as dynamic but whose operand is a known constant
/// integer is moved into the corresponding static list.
///
/// Returns failure() (leaving the op untouched) when none of the six
/// offset/size/stride lists contains such an entry. Otherwise the op is
/// replaced by a new NpuDmaCpyNdOp built from the updated static/dynamic lists
/// together with the original DMA operand and target/source BD IDs, and
/// success() is returned.
LogicalResult NpuDmaCpyNdOp::canonicalize(NpuDmaCpyNdOp dmaOp,
                                          PatternRewriter &rewriter) {
  // True if at least one position marked dynamic in `statics` is backed by a
  // constant integer in `mixed`, i.e. there is something to promote.
  auto hasPromotableEntry = [](ArrayRef<OpFoldResult> mixed,
                               ArrayRef<int64_t> statics) -> bool {
    for (uint64_t idx = 0; idx < statics.size(); ++idx) {
      if (statics[idx] != ShapedType::kDynamic) continue;
      if (getConstantIntValue(mixed[idx]).has_value()) return true;
    }
    return false;
  };

  // The lists are inspected source-first, then target, stopping at the first
  // list that has a promotable entry.
  bool anythingToFold =
      hasPromotableEntry(dmaOp.getSourceMixedOffsets(),
                         dmaOp.getSourceStaticOffsets()) ||
      hasPromotableEntry(dmaOp.getSourceMixedSizes(),
                         dmaOp.getSourceStaticSizes()) ||
      hasPromotableEntry(dmaOp.getSourceMixedStrides(),
                         dmaOp.getSourceStaticStrides()) ||
      hasPromotableEntry(dmaOp.getTargetMixedOffsets(),
                         dmaOp.getTargetStaticOffsets()) ||
      hasPromotableEntry(dmaOp.getTargetMixedSizes(),
                         dmaOp.getTargetStaticSizes()) ||
      hasPromotableEntry(dmaOp.getTargetMixedStrides(),
                         dmaOp.getTargetStaticStrides());
  if (!anythingToFold) return failure();

  // Rebuilds one static/dynamic pair of lists. Entries that were already
  // static are kept, dynamic entries with a constant value become static, and
  // the remaining dynamic entries keep their SSA value.
  auto rebuild = [](ArrayRef<OpFoldResult> mixed, ArrayRef<int64_t> statics)
      -> std::tuple<SmallVector<int64_t>, SmallVector<Value>> {
    SmallVector<int64_t> updatedStatics;
    SmallVector<Value> updatedDynamics;
    for (uint64_t idx = 0; idx < statics.size(); ++idx) {
      // Already static: carry over unchanged.
      if (statics[idx] != ShapedType::kDynamic) {
        updatedStatics.push_back(statics[idx]);
        continue;
      }
      // Dynamic but constant: promote to static.
      auto constantValue = getConstantIntValue(mixed[idx]);
      if (constantValue.has_value()) {
        updatedStatics.push_back(constantValue.value());
        continue;
      }
      // Genuinely dynamic: keep the placeholder and the operand.
      updatedStatics.push_back(ShapedType::kDynamic);
      updatedDynamics.push_back(mixed[idx].get<Value>());
    }
    return {updatedStatics, updatedDynamics};
  };

  auto [sourceStaticOffsets, sourceDynamicOffsets] =
      rebuild(dmaOp.getSourceMixedOffsets(), dmaOp.getSourceStaticOffsets());
  auto [sourceStaticSizes, sourceDynamicSizes] =
      rebuild(dmaOp.getSourceMixedSizes(), dmaOp.getSourceStaticSizes());
  auto [sourceStaticStrides, sourceDynamicStrides] =
      rebuild(dmaOp.getSourceMixedStrides(), dmaOp.getSourceStaticStrides());

  auto [targetStaticOffsets, targetDynamicOffsets] =
      rebuild(dmaOp.getTargetMixedOffsets(), dmaOp.getTargetStaticOffsets());
  auto [targetStaticSizes, targetDynamicSizes] =
      rebuild(dmaOp.getTargetMixedSizes(), dmaOp.getTargetStaticSizes());
  auto [targetStaticStrides, targetDynamicStrides] =
      rebuild(dmaOp.getTargetMixedStrides(), dmaOp.getTargetStaticStrides());

  // Argument order: dma, target (dynamic x3, static x3), source (dynamic x3,
  // static x3), target BD ID, source BD ID.
  rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
      dmaOp, dmaOp.getDma(), targetDynamicOffsets, targetDynamicSizes,
      targetDynamicStrides, targetStaticOffsets, targetStaticSizes,
      targetStaticStrides, sourceDynamicOffsets, sourceDynamicSizes,
      sourceDynamicStrides, sourceStaticOffsets, sourceStaticSizes,
      sourceStaticStrides, dmaOp.getTargetBdId(), dmaOp.getSourceBdId());

  return success();
}

LogicalResult LogicalObjectFifoFromMemrefOp::canonicalize(
LogicalObjectFifoFromMemrefOp logicalObjectFifo,
PatternRewriter &rewriter) {
Expand Down Expand Up @@ -449,8 +515,8 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result,
// AMDAIE_NpuDmaCpyNdOp
//===----------------------------------------------------------------------===//

// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and
// source BD IDs.
// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target
// and source BD IDs.
void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma,
ArrayRef<OpFoldResult> targetOffsets,
ArrayRef<OpFoldResult> targetSizes,
Expand Down Expand Up @@ -645,5 +711,4 @@ LogicalResult WorkgroupOp::verify() {
}
return success();
}

} // namespace mlir::iree_compiler::AMDAIE
6 changes: 6 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,13 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd",
::llvm::SmallVector<::mlir::OpFoldResult>& newSourceOffsets,
::llvm::SmallVector<::mlir::OpFoldResult>& newSourceSizes,
::llvm::SmallVector<::mlir::OpFoldResult>& newSourceStrides);

}];

// Ensure that dimensions of offsets/sizes/strides that can be static, are.
// TODO(newling) make this a canonicalization for all doubly strided ops.
let hasCanonicalizeMethod = 1;

}

def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,12 +280,8 @@ func.func @dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<m
// CHECK-LABEL: func.func @npu_dma_cpy_nd_source
// CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
// CHECK-SAME: %[[ARG1:.+]]: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] [])
// CHECK: amdaie.npu.dma_cpy_nd %[[DMA0]]([%[[C0]]] [%[[C128]]] [%[[C1]]], [%[[C0]]] [%[[C64]]] [%[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %[[DMA0]]([0] [128] [1], [0] [64] [1])

// FOLD-SINGLE-DIMS-LABEL: func.func @npu_dma_cpy_nd_source
// FOLD-SINGLE-DIMS-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
Expand All @@ -301,11 +297,7 @@ func.func @npu_dma_cpy_nd_source(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x1
// -----

// CHECK-LABEL: func.func @npu_dma_cpy_nd_linear_implicit
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C0]]] [%[[C128]]] [%[[C1]]], [%[[C0]]] [%[[C64]]] [%[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([0] [128] [1], [0] [64] [1])
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([] [] [], [] [] [])
func.func @npu_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
Expand All @@ -316,20 +308,8 @@ func.func @npu_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo<memre
// -----

// CHECK-LABEL: func.func @npu_dma_cpy_nd_linear
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C0]], %[[C0]]] [%[[C16]], %[[C8]]] [%[[C16]], %[[C1]]], [%[[C0]], %[[C0]], %[[C0]]] [%[[C64]], %[[C16]], %[[C128]]] [%[[C128]], %[[C16]], %[[C1]]])
// FOLD-SINGLE-DIMS-DAG: %[[C0:.+]] = arith.constant 0 : index
// FOLD-SINGLE-DIMS-DAG: %[[C1:.+]] = arith.constant 1 : index
// FOLD-SINGLE-DIMS-DAG: %[[C8:.+]] = arith.constant 8 : index
// FOLD-SINGLE-DIMS-DAG: %[[C16:.+]] = arith.constant 16 : index
// FOLD-SINGLE-DIMS-DAG: %[[C64:.+]] = arith.constant 64 : index
// FOLD-SINGLE-DIMS-DAG: %[[C128:.+]] = arith.constant 128 : index
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C0]], %[[C0]]] [%[[C16]], %[[C8]]] [%[[C16]], %[[C1]]], [%[[C0]], %[[C0]], %[[C0]]] [%[[C64]], %[[C16]], %[[C128]]] [%[[C128]], %[[C16]], %[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([0, 0] [16, 8] [16, 1], [0, 0, 0] [64, 16, 128] [128, 16, 1])
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([0, 0] [16, 8] [16, 1], [0, 0, 0] [64, 16, 128] [128, 16, 1])
func.func @npu_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
%c16 = arith.constant 16 : index
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
Expand All @@ -351,19 +331,8 @@ func.func @npu_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x
// -----

// CHECK-LABEL: func.func @npu_dma_cpy_nd_unit
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C0]]] [%[[C128]]] [%[[C1]]], [%[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C8]], %[[C8]]] [%[[C8]], %[[C16]], %[[C1]]])
// FOLD-SINGLE-DIMS-DAG: %[[C0:.+]] = arith.constant 0 : index
// FOLD-SINGLE-DIMS-DAG: %[[C1:.+]] = arith.constant 1 : index
// FOLD-SINGLE-DIMS-DAG: %[[C2:.+]] = arith.constant 2 : index
// FOLD-SINGLE-DIMS-DAG: %[[C8:.+]] = arith.constant 8 : index
// FOLD-SINGLE-DIMS-DAG: %[[C16:.+]] = arith.constant 16 : index
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([] [] [], [%[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C8]], %[[C8]]] [%[[C8]], %[[C16]], %[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([0] [128] [1], [0, 0, 0] [2, 8, 8] [8, 16, 1])
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([] [] [], [0, 0, 0] [2, 8, 8] [8, 16, 1])
func.func @npu_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>) {
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x2x2x4x8xi32, 1>>, !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>)
%1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], [0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1])
Expand All @@ -373,11 +342,7 @@ func.func @npu_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo<memref<1x1x2x2x4
// -----

// CHECK-LABEL: func.func @npu_dma_cpy_nd_unit_between_linear
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG: %[[C8192:.+]] = arith.constant 8192 : index
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C0]]] [%[[C8192]]] [%[[C1]]], [%[[C0]]] [%[[C128]]] [%[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([0] [8192] [1], [0] [128] [1])
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([] [] [], [] [] [])
func.func @npu_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
Expand All @@ -399,14 +364,8 @@ func.func @npu_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memre
// -----

// CHECK-LABEL: func.func @npu_dma_cpy_nd_partial_non_zero_offset
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C1]]] [%[[C128]]] [%[[C1]]], [%[[C1]]] [%[[C64]]] [%[[C1]]])
// FOLD-SINGLE-DIMS-DAG: %[[C1:.+]] = arith.constant 1 : index
// FOLD-SINGLE-DIMS-DAG: %[[C64:.+]] = arith.constant 64 : index
// FOLD-SINGLE-DIMS-DAG: %[[C128:.+]] = arith.constant 128 : index
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([%[[C1]]] [%[[C128]]] [%[[C1]]], [%[[C1]]] [%[[C64]]] [%[[C1]]])
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([1] [128] [1], [1] [64] [1])
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([1] [128] [1], [1] [64] [1])
func.func @npu_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
%1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], [0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1])
Expand Down
Loading

0 comments on commit 93b20b1

Please sign in to comment.