[AMDGPU] Produce ballot/icmp/fcmp lane masks at wavefront width#201358
[AMDGPU] Produce ballot/icmp/fcmp lane masks at wavefront width#201358aobolensk wants to merge 8 commits into
Conversation
Reading read_register with the return type and a hardcoded "exec" name crashed isel on a width mismatch Read EXEC at wave-size width with the matching register name, then zext/trunc to the actual return type
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms Author: Arseniy Obolenskiy (aobolensk) ChangesReading read_register with the return type and a hardcoded "exec" name crashed isel on a width mismatch Read EXEC at wave-size width with the matching register name, then zext/trunc to the actual return type Full diff: https://github.com/llvm/llvm-project/pull/201358.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2370c379e75f5..8457b2594d3fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1644,15 +1644,19 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// intrinsic exposes) is one bit per thread, masked with the EXEC
// register (which contains the bitmask of live threads). So a
// comparison that always returns true is the same as a read of the
- // EXEC register.
- Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
+ // EXEC register. EXEC is wave-size bits wide, so read it at that width
+ // and zext/trunc to the return type.
+ Type *ExecTy = IC.Builder.getIntNTy(ST->getWavefrontSize());
+ StringRef ExecName = ST->isWave32() ? "exec_lo" : "exec";
+ Metadata *MDArgs[] = {MDString::get(II.getContext(), ExecName)};
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
- CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
- II.getType(), Args);
+ CallInst *NewCall =
+ IC.Builder.CreateIntrinsic(Intrinsic::read_register, ExecTy, Args);
NewCall->addFnAttr(Attribute::Convergent);
- NewCall->takeName(&II);
- return IC.replaceInstUsesWith(II, NewCall);
+ Value *Result = IC.Builder.CreateZExtOrTrunc(NewCall, II.getType());
+ Result->takeName(&II);
+ return IC.replaceInstUsesWith(II, Result);
}
// Canonicalize constants to RHS.
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-cmp-exec-fold-wave-size.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-cmp-exec-fold-wave-size.ll
new file mode 100644
index 0000000000000..d9e3030e53c4d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-cmp-exec-fold-wave-size.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=instcombine -S < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine -S < %s | FileCheck -check-prefix=WAVE32 %s
+
+; The "always true" amdgcn.icmp/amdgcn.fcmp fold reads EXEC, which is wave-size
+; bits wide. The read_register must use that width and register name regardless
+; of the return type, then zext/trunc.
+
+; Return type wider than the wave size on wave32: read i32 exec_lo, then zext.
+define i64 @fcmp_true_i64(float %x, float %y) {
+; WAVE64-LABEL: @fcmp_true_i64(
+; WAVE64-NEXT: [[R:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR3:[0-9]+]]
+; WAVE64-NEXT: ret i64 [[R]]
+;
+; WAVE32-LABEL: @fcmp_true_i64(
+; WAVE32-NEXT: [[TMP1:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0:![0-9]+]]) #[[ATTR3:[0-9]+]]
+; WAVE32-NEXT: [[R:%.*]] = zext i32 [[TMP1]] to i64
+; WAVE32-NEXT: ret i64 [[R]]
+;
+ %r = call i64 @llvm.amdgcn.fcmp.i64.f32(float 0.0, float 0.0, i32 1)
+ ret i64 %r
+}
+
+define i64 @icmp_true_i64(i32 %x, i32 %y) {
+; WAVE64-LABEL: @icmp_true_i64(
+; WAVE64-NEXT: [[R:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR3]]
+; WAVE64-NEXT: ret i64 [[R]]
+;
+; WAVE32-LABEL: @icmp_true_i64(
+; WAVE32-NEXT: [[TMP1:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0]]) #[[ATTR3]]
+; WAVE32-NEXT: [[R:%.*]] = zext i32 [[TMP1]] to i64
+; WAVE32-NEXT: ret i64 [[R]]
+;
+ %r = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
+ ret i64 %r
+}
+
+; Return type narrower than the wave size on wave64: read i64 exec, then trunc.
+define i32 @fcmp_true_i32(float %x, float %y) {
+; WAVE64-LABEL: @fcmp_true_i32(
+; WAVE64-NEXT: [[TMP1:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR3]]
+; WAVE64-NEXT: [[R:%.*]] = trunc i64 [[TMP1]] to i32
+; WAVE64-NEXT: ret i32 [[R]]
+;
+; WAVE32-LABEL: @fcmp_true_i32(
+; WAVE32-NEXT: [[R:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0]]) #[[ATTR3]]
+; WAVE32-NEXT: ret i32 [[R]]
+;
+ %r = call i32 @llvm.amdgcn.fcmp.i32.f32(float 0.0, float 0.0, i32 1)
+ ret i32 %r
+}
+
+declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32 immarg)
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32 immarg)
+declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32 immarg)
+;.
+; WAVE64: attributes #[[ATTR0:[0-9]+]] = { "target-cpu"="gfx900" }
+; WAVE64: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) "target-cpu"="gfx900" }
+; WAVE64: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) }
+; WAVE64: attributes #[[ATTR3]] = { convergent }
+;.
+; WAVE32: attributes #[[ATTR0:[0-9]+]] = { "target-cpu"="gfx1010" }
+; WAVE32: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) "target-cpu"="gfx1010" }
+; WAVE32: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) }
+; WAVE32: attributes #[[ATTR3]] = { convergent }
+;.
+; WAVE64: [[META0]] = !{!"exec"}
+;.
+; WAVE32: [[META0]] = !{!"exec_lo"}
+;.
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 87164cff4276e..bc43aaac6c567 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -1882,7 +1882,8 @@ define i64 @icmp_constant_inputs_false() {
define i64 @icmp_constant_inputs_true() {
; CHECK-LABEL: @icmp_constant_inputs_true(
-; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR21:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0:![0-9]+]]) #[[ATTR21:[0-9]+]]
+; CHECK-NEXT: [[RESULT:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: ret i64 [[RESULT]]
;
%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
@@ -2589,7 +2590,8 @@ define i64 @fcmp_constant_inputs_false() {
define i64 @fcmp_constant_inputs_true() {
; CHECK-LABEL: @fcmp_constant_inputs_true(
-; CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR21]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0]]) #[[ATTR21]]
+; CHECK-NEXT: [[RESULT:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: ret i64 [[RESULT]]
;
%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)
|
arsenm
left a comment
There was a problem hiding this comment.
LGTM but we probably should just delete all of this code
| CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register, | ||
| II.getType(), Args); | ||
| CallInst *NewCall = | ||
| IC.Builder.CreateIntrinsic(Intrinsic::read_register, ExecTy, Args); |
There was a problem hiding this comment.
Why do we still have this combine? We switched to preferring ballot(true) vs. read_register for getting exec a long time ago. Also, we should really remove the icmp/fcmp intrinsics
There was a problem hiding this comment.
I have removed the code, please re-check
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
🐧 Linux x64 Test Results
Failed Tests(click on a test name to see its output) ClangClang.Driver/hipspv-toolchain.hipIf these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the |
arsenm
left a comment
There was a problem hiding this comment.
Could still include the cases that were broken? or replace with ballot instead
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
| @@ -0,0 +1,65 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
| ; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx900 < %s | FileCheck %s | |||
There was a problem hiding this comment.
-global-isel=0 should only be used when testing both selectors
There was a problem hiding this comment.
Not relevant anymore
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_mov_b32 s0, 0 | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0) |
There was a problem hiding this comment.
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0) | |
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false) |
There was a problem hiding this comment.
Not relevant anymore
| ; CHECK: ; %bb.0: | ||
| ; CHECK-NEXT: s_mov_b32 s0, exec_lo | ||
| ; CHECK-NEXT: ; return to shader part epilog | ||
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) |
There was a problem hiding this comment.
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) | |
| %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true) |
There was a problem hiding this comment.
Not relevant anymore
| SDValue Src = N->getOperand(1); | ||
| SDLoc SL(N); | ||
|
|
||
| // AMDGPUISD::SETCC produces a wave-sized mask; emit it at that width and |
There was a problem hiding this comment.
I think the i32 return type on wave64 intentionally doesn't work. This is also now going beyond what the title suggests
There was a problem hiding this comment.
Fair enough, reverted
No description provided.