[mlir][memref] Update tests to use memref.assume_alignment properly. #142358

Conversation
- Update all the lit tests to use the result of memref.assume_alignment, if it is present.
- Capture the result of the op in lit tests.

Signed-off-by: hanhanW <hanhan0912@gmail.com>
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-memref

Author: Han-Chung Wang (hanhanW)

Changes

With ffb9bbf, the memref.assume_alignment op returns a result value. This revision updates the tests to reflect the change:

- Update all the lit tests to use the result of memref.assume_alignment, if it is present.
- Capture the result of the op in lit tests.
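For illustration, a minimal sketch of the before/after usage (the function and value names are hypothetical, not taken from the tests):

```mlir
// Old form: the op returned nothing, so later ops kept reading the
// original buffer:
//   memref.assume_alignment %buf, 16 : memref<4x4xf16>
// New form: capture the result and access memory through it, so the
// alignment fact travels with the SSA value.
func.func @use_assume_alignment(%buf: memref<4x4xf16>, %i: index, %j: index) -> f16 {
  %aligned = memref.assume_alignment %buf, 16 : memref<4x4xf16>
  %v = memref.load %aligned[%i, %j] : memref<4x4xf16>
  return %v : f16
}
```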
Full diff: https://github.com/llvm/llvm-project/pull/142358.diff (5 files affected)
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 8c863bb2d3d65..acfc188574255 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -189,7 +189,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) {
// CHECK-NEXT: %[[ALIGN:.*]] = llvm.mlir.constant(16 : index) : i64
// CHECK-NEXT: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i64)] : i1
// CHECK-INTERFACE: llvm.intr.assume
- memref.assume_alignment %0, 16 : memref<4x4xf16>
+ %1 = memref.assume_alignment %0, 16 : memref<4x4xf16>
return
}
@@ -205,7 +205,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset
// CHECK-DAG: %[[ALIGN:.*]] = llvm.mlir.constant(16 : index) : i64
// CHECK-NEXT: llvm.intr.assume %[[TRUE]] ["align"(%[[BUFF_ADDR]], %[[ALIGN]] : !llvm.ptr, i64)] : i1
// CHECK-INTERFACE: llvm.intr.assume
- memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
+ %1 = memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
return
}
// -----
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index 111a02abcc74c..3378d329e8205 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -63,7 +63,7 @@ func.func @memref_load_i4(%arg0: index) -> i4 {
func.func @memref_load_i4_rank2(%arg0: index, %arg1: index) -> i4 {
%0 = memref.alloc() : memref<3x125xi4>
- %align0 =memref.assume_alignment %0, 64 : memref<3x125xi4>
+ %align0 = memref.assume_alignment %0, 64 : memref<3x125xi4>
%1 = memref.load %align0[%arg0,%arg1] : memref<3x125xi4>
return %1 : i4
}
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index 38ee363a7d424..13fdf3cf13510 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -283,7 +283,7 @@ func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) {
// CHECK-LABEL: func @assume_alignment
// CHECK-SAME: %[[MEMREF:.*]]: memref<4x4xf16>
func.func @assume_alignment(%0: memref<4x4xf16>) {
- // CHECK: memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16>
+ // CHECK: %{{.*}} = memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16>
%1 = memref.assume_alignment %0, 16 : memref<4x4xf16>
return
}
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index aaa3aff5350ad..6153a11622a4f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -120,7 +120,7 @@ func.func @main() {
threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
dynamic_shared_memory_size %shmemSize
{
- memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
+ %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
%c256 = arith.constant 256 : index
%c10000000 = arith.constant 10000000 : index
@@ -226,13 +226,13 @@ func.func @main() {
scf.for %arg12 = %17 to %c128 step %c4 {
%19 = arith.muli %18, %c4 : index
%20 = vector.load %accShmemPtr[%arg12, %19] : memref<128x128xf32, 3>, vector<4xf32>
- vector.store %20, %matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
+ vector.store %20, %align_matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
}
gpu.terminator
}
// Step 5. Copy D2H
- %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
+ %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
gpu.wait [%token]
// Step 6. Compute on host
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index b257d2b0f1e34..b8e355712d37f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -120,7 +120,7 @@ func.func @main() {
threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
dynamic_shared_memory_size %shmemSize
{
- memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
+ %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
%c256 = arith.constant 256 : index
%c10000000 = arith.constant 10000000 : index
@@ -234,13 +234,13 @@ func.func @main() {
scf.for %arg12 = %17 to %c128 step %c4 {
%19 = arith.muli %18, %c4 : index
%20 = vector.load %accShmemPtr[%arg12, %19] : memref<128x128xf32, 3>, vector<4xf32>
- vector.store %20, %matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
+ vector.store %20, %align_matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
}
gpu.terminator
}
// Step 5. Copy D2H
- %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
+ %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
gpu.wait [%token]
// Step 6. Compute on host
I don't know why (see llvm-project/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td, lines 165 to 167 in 1984c75).
      }
      gpu.terminator
    }

    // Step 5. Copy D2H
-   %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
+   %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
I'm not sure if we want to use align_matrixD here or not, so I moved the declaration outside the GPU kernel block. I'm happy to move it back if my change is not correct.
No need for align here. I mentioned above the line where we do need it.
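For context, a standalone sketch of the resolution (names and launch configuration are hypothetical, not taken from the test): the aligned view is consumed inside the kernel, where it enables the vectorized store, while the host-side copy keeps using the original buffer.

```mlir
func.func @sketch(%matrixD: memref<128x128xf32>, %matrixDHost: memref<128x128xf32>,
                  %v: vector<4xf32>, %i: index, %j: index) {
  %c1 = arith.constant 1 : index
  %token = gpu.wait async
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
    // Inside the kernel: store through the aligned view.
    %aligned = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
    vector.store %v, %aligned[%i, %j] : memref<128x128xf32>, vector<4xf32>
    gpu.terminator
  }
  // On the host: a plain device-to-host copy; the original buffer suffices.
  %t = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
  gpu.wait [%t]
  return
}
```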
(I don't have a local setup for nvgpu, so I can only test it here.)
I can't add @shay-kl to reviewers, so I'm tagging you instead.
Don't have enough expertise to comment on the specifics of the GPU interaction, but otherwise LGTM.
@@ -226,13 +226,13 @@ func.func @main() {
      scf.for %arg12 = %17 to %c128 step %c4 {
        %19 = arith.muli %18, %c4 : index
        %20 = vector.load %accShmemPtr[%arg12, %19] : memref<128x128xf32, 3>, vector<4xf32>
-       vector.store %20, %matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
+       vector.store %20, %align_matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
We only need the aligned matrix here, so we can generate a vectorized store.
thanks!
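For context, a minimal standalone sketch of the point above (function and value names are hypothetical): a vector<4xf32> is 16 bytes, so storing through the result of a 16-byte alignment assumption lets the lowering emit an aligned vector store.

```mlir
func.func @vector_store_aligned(%dst: memref<128x128xf32>, %v: vector<4xf32>,
                                %i: index, %j: index) {
  // The asserted 16-byte alignment matches the width of vector<4xf32>.
  %aligned = memref.assume_alignment %dst, 16 : memref<128x128xf32>
  vector.store %v, %aligned[%i, %j] : memref<128x128xf32>, vector<4xf32>
  return
}
```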