-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[mlir] Add optimization to bubbleUpPadSlice pattern for no pad case #135859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir] Add optimization to bubbleUpPadSlice pattern for no pad case #135859
Conversation
Signed-off-by: Nirvedh <nirvedh@gmail.com>
@llvm/pr-subscribers-mlir-tensor @llvm/pr-subscribers-mlir Author: Nirvedh Meshram (nirvedhmeshram) Changes: In cases where there is no padding on a dim, we do not need to compute new offsets, lengths, and padding; for example, the new test case added can just be lowered to
with this PR but without this PR we will have affine maps like
which are unnecessary Full diff: https://github.com/llvm/llvm-project/pull/135859.diff 3 Files Affected:
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 138e4be6b18e9..7778a02dbeaf4 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -122,7 +122,7 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult zero = b.getIndexAttr(0);
// Compute new offsets, lengths, low padding, high padding.
- SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
+ SmallVector<OpFoldResult> newOffsets, newLengths;
SmallVector<OpFoldResult> newLows, newHighs;
// Set to true if the original data source is not read at all.
bool hasZeroLen = false;
@@ -131,6 +131,8 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
Value dynHasZeroLenCond;
int64_t rank = padOp.getSourceType().getRank();
+ // Only unit stride supported.
+ SmallVector<OpFoldResult> newStrides(rank, b.getIndexAttr(1));
for (unsigned dim = 0; dim < rank; ++dim) {
auto low = padOp.getMixedLowPad()[dim];
bool hasLowPad = !isConstantIntValue(low, 0);
@@ -138,6 +140,16 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
bool hasHighPad = !isConstantIntValue(high, 0);
auto offset = offsets[dim];
auto length = sizes[dim];
+ // If the dim has no padding, we don't need to calculate new values for that
+ // dim as the existing ones are correct even after the pattern.
+ if (!hasLowPad && !hasHighPad) {
+ newOffsets.push_back(offset);
+ newLengths.push_back(length);
+ newLows.push_back(low);
+ newHighs.push_back(high);
+ continue;
+ }
+
auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim);
// The new amount of low padding is `low - offset`. Except for the case
@@ -216,9 +228,6 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult newHigh =
hasHighPad ? sub(sub(length, newLength), newLow) : zero;
newHighs.push_back(newHigh);
-
- // Only unit stride supported.
- newStrides.push_back(b.getIndexAttr(1));
}
// The shape of the result can be obtained from the sizes passed in.
diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
index d6c400dcbf2b9..6cab25b50460d 100644
--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
+++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
@@ -7,17 +7,17 @@
// CHECK: scf.for
// CHECK: memref.alloc() : memref<128x16xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<16x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<128x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK-NOT: mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: linalg.matmul
diff --git a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
index b4417641c9f83..d43b9a7ac6c04 100644
--- a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
+++ b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
@@ -216,3 +216,20 @@ func.func @dynamic_zero_high_padding(%arg0 : tensor<?x?xf32>, %pad : f32,
%1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
return %1 : tensor<?x?xf32>
}
+
+// -----
+// CHECK-LABEL: @nopaddim_with_dynamic_extract(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3x4x5xf32>
+// CHECK-SAME: %[[ARG1:.*]]: f32
+// CHECK-SAME: %[[ARG2:.*]]: index
+// CHECK: %[[RESULT:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 1, 2] [%[[ARG2]], 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
+// CHECK: return %[[RESULT]]
+func.func @nopaddim_with_dynamic_extract(%arg0 : tensor<3x4x5xf32>, %pad : f32, %index : index)
+ -> tensor<?x2x1xf32> {
+ %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 8] {
+ ^bb0(%arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %pad : f32
+ } : tensor<3x4x5xf32> to tensor<3x11x13xf32>
+ %1 = tensor.extract_slice %0[%index, 1, 2] [%index, 2, 1] [1, 1, 1] : tensor<3x11x13xf32> to tensor<?x2x1xf32>
+ return %1 : tensor<?x2x1xf32>
+}
|
@llvm/pr-subscribers-mlir-linalg Author: Nirvedh Meshram (nirvedhmeshram) Changes: In cases where there is no padding on a dim, we do not need to compute new offsets, lengths, and padding; for example, the new test case added can just be lowered to
with this PR but without this PR we will have affine maps like
which are unnecessary Full diff: https://github.com/llvm/llvm-project/pull/135859.diff 3 Files Affected:
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 138e4be6b18e9..7778a02dbeaf4 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -122,7 +122,7 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult zero = b.getIndexAttr(0);
// Compute new offsets, lengths, low padding, high padding.
- SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
+ SmallVector<OpFoldResult> newOffsets, newLengths;
SmallVector<OpFoldResult> newLows, newHighs;
// Set to true if the original data source is not read at all.
bool hasZeroLen = false;
@@ -131,6 +131,8 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
Value dynHasZeroLenCond;
int64_t rank = padOp.getSourceType().getRank();
+ // Only unit stride supported.
+ SmallVector<OpFoldResult> newStrides(rank, b.getIndexAttr(1));
for (unsigned dim = 0; dim < rank; ++dim) {
auto low = padOp.getMixedLowPad()[dim];
bool hasLowPad = !isConstantIntValue(low, 0);
@@ -138,6 +140,16 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
bool hasHighPad = !isConstantIntValue(high, 0);
auto offset = offsets[dim];
auto length = sizes[dim];
+ // If the dim has no padding, we don't need to calculate new values for that
+ // dim as the existing ones are correct even after the pattern.
+ if (!hasLowPad && !hasHighPad) {
+ newOffsets.push_back(offset);
+ newLengths.push_back(length);
+ newLows.push_back(low);
+ newHighs.push_back(high);
+ continue;
+ }
+
auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim);
// The new amount of low padding is `low - offset`. Except for the case
@@ -216,9 +228,6 @@ FailureOr<TilingResult> tensor::bubbleUpPadSlice(OpBuilder &b,
OpFoldResult newHigh =
hasHighPad ? sub(sub(length, newLength), newLow) : zero;
newHighs.push_back(newHigh);
-
- // Only unit stride supported.
- newStrides.push_back(b.getIndexAttr(1));
}
// The shape of the result can be obtained from the sizes passed in.
diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
index d6c400dcbf2b9..6cab25b50460d 100644
--- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
+++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir
@@ -7,17 +7,17 @@
// CHECK: scf.for
// CHECK: memref.alloc() : memref<128x16xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<16x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK: vector.constant_mask [16, 4] : vector<128x4xi1>
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: memref.alloc() : memref<128x128xf32, 3>
// CHECK: scf.forall
-// CHECK: vector.create_mask
+// CHECK-NOT: mask
// CHECK: vector.transfer_read
// CHECK: vector.transfer_write
// CHECK: linalg.matmul
diff --git a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
index b4417641c9f83..d43b9a7ac6c04 100644
--- a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
+++ b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir
@@ -216,3 +216,20 @@ func.func @dynamic_zero_high_padding(%arg0 : tensor<?x?xf32>, %pad : f32,
%1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
return %1 : tensor<?x?xf32>
}
+
+// -----
+// CHECK-LABEL: @nopaddim_with_dynamic_extract(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<3x4x5xf32>
+// CHECK-SAME: %[[ARG1:.*]]: f32
+// CHECK-SAME: %[[ARG2:.*]]: index
+// CHECK: %[[RESULT:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 1, 2] [%[[ARG2]], 2, 1] [1, 1, 1] : tensor<3x4x5xf32> to tensor<?x2x1xf32>
+// CHECK: return %[[RESULT]]
+func.func @nopaddim_with_dynamic_extract(%arg0 : tensor<3x4x5xf32>, %pad : f32, %index : index)
+ -> tensor<?x2x1xf32> {
+ %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 8] {
+ ^bb0(%arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %pad : f32
+ } : tensor<3x4x5xf32> to tensor<3x11x13xf32>
+ %1 = tensor.extract_slice %0[%index, 1, 2] [%index, 2, 1] [1, 1, 1] : tensor<3x11x13xf32> to tensor<?x2x1xf32>
+ return %1 : tensor<?x2x1xf32>
+}
|
In cases where there is no padding on a dim, we do not need to compute new offsets, lengths, and padding; for example, the new test case added can just be lowered to
whereas without this PR we will have affine maps like
which are unnecessary.