-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SelectionDAG] Avoid one comparison when legalizing fmaximum #142732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
When ordering signed zero, only check the sign of one of the values. We already know at this point that both values must be +/-0.0, so it is sufficient to check one of them to correctly order them.
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-nvptx
Author: Nikita Popov (nikic)
Changes: When ordering signed zero, only check the sign of one of the values. We already know at this point that both values must be +/-0.0, so it is sufficient to check one of them to correctly order them. For example, for fmaximum, if we know LHS is `+0.0` then we can always select LHS; the value of RHS does not matter. If LHS is `-0.0` we can always select RHS; the value of RHS doesn't matter.
Patch is 99.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142732.diff
9 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c8fe8971e593c..f34bf0ca7ede0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8610,19 +8610,16 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
// fminimum/fmaximum requires -0.0 less than +0.0
if (!MinMaxMustRespectOrderedZero && !N->getFlags().hasNoSignedZeros() &&
!DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS)) {
- auto IsSpecificZero = [&](SDValue F) {
- FloatSignAsInt State;
- DAG.getSignAsIntValue(State, DL, F);
- return DAG.getSetCC(DL, CCVT, State.IntValue,
- DAG.getConstant(0, DL, State.IntValue.getValueType()),
- IsMax ? ISD::SETEQ : ISD::SETNE);
- };
SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
DAG.getConstantFP(0.0, DL, VT), ISD::SETOEQ);
- SDValue LCmp =
- DAG.getSelect(DL, VT, IsSpecificZero(LHS), LHS, MinMax, Flags);
- SDValue RCmp = DAG.getSelect(DL, VT, IsSpecificZero(RHS), RHS, LCmp, Flags);
- MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+ FloatSignAsInt State;
+ DAG.getSignAsIntValue(State, DL, LHS);
+ SDValue IsSpecificZero =
+ DAG.getSetCC(DL, CCVT, State.IntValue,
+ DAG.getConstant(0, DL, State.IntValue.getValueType()),
+ IsMax ? ISD::SETEQ : ISD::SETNE);
+ SDValue Sel = DAG.getSelect(DL, VT, IsSpecificZero, LHS, RHS, Flags);
+ MinMax = DAG.getSelect(DL, VT, IsZero, Sel, MinMax, Flags);
}
return MinMax;
diff --git a/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll b/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
index 9f542abcb80f7..b47470fb78234 100644
--- a/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
@@ -46,51 +46,46 @@ define fp128 @maximum_fp128(fp128 %x, fp128 %y) nounwind {
; CHECK-LABEL: maximum_fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: str q0, [sp, #64]
+; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: ldrb w8, [sp, #79]
; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
-; CHECK-NEXT: stp q1, q0, [sp, #48]
-; CHECK-NEXT: bl __gttf2
-; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: b.le .LBB1_2
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: b.ne .LBB1_2
; CHECK-NEXT: // %bb.1:
-; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: bl __unordtf2
-; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: stp q2, q0, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: ldp q0, q1, [sp, #16] // 32-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: b.eq .LBB1_4
+; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: b.le .LBB1_4
; CHECK-NEXT: // %bb.3:
-; CHECK-NEXT: adrp x8, .LCPI1_0
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldrb w8, [sp, #79]
-; CHECK-NEXT: mov v1.16b, v0.16b
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: b.ne .LBB1_6
+; CHECK-NEXT: str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: bl __unordtf2
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: b.eq .LBB1_6
; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: .LBB1_6:
-; CHECK-NEXT: ldrb w8, [sp, #63]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: b.ne .LBB1_8
-; CHECK-NEXT: // %bb.7:
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: .LBB1_8:
+; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: adrp x8, .LCPI1_1
-; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1]
-; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: bl __eqtf2
-; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: b.ne .LBB1_10
-; CHECK-NEXT: // %bb.9:
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: .LBB1_10:
+; CHECK-NEXT: b.ne .LBB1_8
+; CHECK-NEXT: // %bb.7:
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .LBB1_8:
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/ARM/fp-maximum-legalization.ll b/llvm/test/CodeGen/ARM/fp-maximum-legalization.ll
index a3ab144356e16..f3e3e17a22eaf 100644
--- a/llvm/test/CodeGen/ARM/fp-maximum-legalization.ll
+++ b/llvm/test/CodeGen/ARM/fp-maximum-legalization.ll
@@ -4,7 +4,7 @@
define double @maximum_double(double %x, double %y) nounwind {
; CHECK-LABEL: maximum_double:
; CHECK: @ %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: sub sp, sp, #8
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: vmov d16, r0, r1
@@ -12,32 +12,26 @@ define double @maximum_double(double %x, double %y) nounwind {
; CHECK-NEXT: vcmp.f64 d16, d17
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vstr d16, [sp, #8]
-; CHECK-NEXT: vstr d17, [sp]
-; CHECK-NEXT: ldrb r1, [sp, #15]
+; CHECK-NEXT: vldr d18, .LCPI0_0
; CHECK-NEXT: vmov.f64 d19, d17
+; CHECK-NEXT: vstr d16, [sp]
+; CHECK-NEXT: ldrb r1, [sp, #7]
; CHECK-NEXT: clz r1, r1
-; CHECK-NEXT: vldr d18, .LCPI0_0
; CHECK-NEXT: movwvs r2, #1
; CHECK-NEXT: movwgt r3, #1
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: vmovne.f64 d19, d16
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: ldrb r2, [sp, #7]
; CHECK-NEXT: vmovne.f64 d19, d18
; CHECK-NEXT: lsrs r1, r1, #5
-; CHECK-NEXT: clz r1, r2
; CHECK-NEXT: vcmp.f64 d19, #0
-; CHECK-NEXT: vmov.f64 d18, d19
-; CHECK-NEXT: vmovne.f64 d18, d16
-; CHECK-NEXT: lsrs r1, r1, #5
-; CHECK-NEXT: vmovne.f64 d18, d17
+; CHECK-NEXT: vmovne.f64 d17, d16
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: movweq r0, #1
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vmovne.f64 d19, d18
+; CHECK-NEXT: vmovne.f64 d19, d17
; CHECK-NEXT: vmov r0, r1, d19
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 3
; CHECK-NEXT: @ %bb.1:
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 096649e5bde43..e16ddb778099c 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -1351,30 +1351,28 @@ define bfloat @test_roundeven(bfloat %a) {
define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM70-LABEL: test_maximum(
; SM70: {
-; SM70-NEXT: .reg .pred %p<6>;
-; SM70-NEXT: .reg .b16 %rs<8>;
+; SM70-NEXT: .reg .pred %p<5>;
+; SM70-NEXT: .reg .b16 %rs<7>;
; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
+; SM70-NEXT: setp.eq.s16 %p1, %rs1, 0;
; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
+; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
; SM70-NEXT: shl.b32 %r2, %r1, 16;
; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
; SM70-NEXT: shl.b32 %r4, %r3, 16;
-; SM70-NEXT: setp.gt.f32 %p1, %r4, %r2;
-; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; SM70-NEXT: setp.nan.f32 %p2, %r4, %r2;
-; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
-; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0;
-; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
-; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
-; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
-; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: setp.gt.f32 %p2, %r4, %r2;
+; SM70-NEXT: selp.b16 %rs4, %rs1, %rs2, %p2;
+; SM70-NEXT: setp.nan.f32 %p3, %r4, %r2;
+; SM70-NEXT: selp.b16 %rs5, 0x7FC0, %rs4, %p3;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
-; SM70-NEXT: setp.eq.f32 %p5, %r6, 0f00000000;
-; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
-; SM70-NEXT: st.param.b16 [func_retval0], %rs7;
+; SM70-NEXT: setp.eq.f32 %p4, %r6, 0f00000000;
+; SM70-NEXT: selp.b16 %rs6, %rs3, %rs5, %p4;
+; SM70-NEXT: st.param.b16 [func_retval0], %rs6;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maximum(
@@ -1475,48 +1473,44 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70-LABEL: test_maximum_v2(
; SM70: {
-; SM70-NEXT: .reg .pred %p<11>;
-; SM70-NEXT: .reg .b16 %rs<19>;
+; SM70-NEXT: .reg .pred %p<9>;
+; SM70-NEXT: .reg .b16 %rs<15>;
; SM70-NEXT: .reg .b32 %r<16>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; SM70-NEXT: setp.eq.s16 %p1, %rs4, 0;
+; SM70-NEXT: selp.b16 %rs7, %rs4, %rs2, %p1;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
; SM70-NEXT: shl.b32 %r4, %r3, 16;
-; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
-; SM70-NEXT: setp.gt.f32 %p1, %r6, %r4;
-; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; SM70-NEXT: setp.nan.f32 %p2, %r6, %r4;
-; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2;
-; SM70-NEXT: setp.eq.s16 %p3, %rs4, 0;
-; SM70-NEXT: selp.b16 %rs9, %rs4, %rs6, %p3;
-; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
-; SM70-NEXT: selp.b16 %rs12, %rs2, %rs9, %p4;
-; SM70-NEXT: cvt.u32.u16 %r7, %rs6;
+; SM70-NEXT: setp.gt.f32 %p2, %r6, %r4;
+; SM70-NEXT: selp.b16 %rs8, %rs4, %rs2, %p2;
+; SM70-NEXT: setp.nan.f32 %p3, %r6, %r4;
+; SM70-NEXT: selp.b16 %rs9, 0x7FC0, %rs8, %p3;
+; SM70-NEXT: cvt.u32.u16 %r7, %rs9;
; SM70-NEXT: shl.b32 %r8, %r7, 16;
-; SM70-NEXT: setp.eq.f32 %p5, %r8, 0f00000000;
-; SM70-NEXT: selp.b16 %rs13, %rs12, %rs6, %p5;
+; SM70-NEXT: setp.eq.f32 %p4, %r8, 0f00000000;
+; SM70-NEXT: selp.b16 %rs10, %rs7, %rs9, %p4;
+; SM70-NEXT: setp.eq.s16 %p5, %rs3, 0;
+; SM70-NEXT: selp.b16 %rs11, %rs3, %rs1, %p5;
; SM70-NEXT: cvt.u32.u16 %r9, %rs1;
; SM70-NEXT: shl.b32 %r10, %r9, 16;
; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
; SM70-NEXT: shl.b32 %r12, %r11, 16;
; SM70-NEXT: setp.gt.f32 %p6, %r12, %r10;
-; SM70-NEXT: selp.b16 %rs14, %rs3, %rs1, %p6;
+; SM70-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
; SM70-NEXT: setp.nan.f32 %p7, %r12, %r10;
-; SM70-NEXT: selp.b16 %rs15, 0x7FC0, %rs14, %p7;
-; SM70-NEXT: setp.eq.s16 %p8, %rs3, 0;
-; SM70-NEXT: selp.b16 %rs16, %rs3, %rs15, %p8;
-; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0;
-; SM70-NEXT: selp.b16 %rs17, %rs1, %rs16, %p9;
-; SM70-NEXT: cvt.u32.u16 %r13, %rs15;
+; SM70-NEXT: selp.b16 %rs13, 0x7FC0, %rs12, %p7;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs13;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
-; SM70-NEXT: setp.eq.f32 %p10, %r14, 0f00000000;
-; SM70-NEXT: selp.b16 %rs18, %rs17, %rs15, %p10;
-; SM70-NEXT: mov.b32 %r15, {%rs18, %rs13};
+; SM70-NEXT: setp.eq.f32 %p8, %r14, 0f00000000;
+; SM70-NEXT: selp.b16 %rs14, %rs11, %rs13, %p8;
+; SM70-NEXT: mov.b32 %r15, {%rs14, %rs10};
; SM70-NEXT: st.param.b32 [func_retval0], %r15;
; SM70-NEXT: ret;
;
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index 441fdec7ce5c0..d84d725bf72cf 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -612,27 +612,25 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
define half @minimum_half(half %a, half %b) {
; CHECK-NOF16-LABEL: minimum_half(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<6>;
-; CHECK-NOF16-NEXT: .reg .b16 %rs<8>;
+; CHECK-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-NOF16-NEXT: setp.ne.s16 %p1, %rs1, 0;
; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1];
+; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
-; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1;
-; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1;
-; CHECK-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
-; CHECK-NOF16-NEXT: setp.ne.s16 %p3, %rs1, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
-; CHECK-NOF16-NEXT: setp.ne.s16 %p4, %rs2, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
-; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs7;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %r2, %r1;
+; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs2, %p2;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p3, %r2, %r1;
+; CHECK-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs5;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs3, %rs5, %p4;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs6;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_half(
@@ -648,27 +646,25 @@ define half @minimum_half(half %a, half %b) {
;
; CHECK-SM80-NOF16-LABEL: minimum_half(
; CHECK-SM80-NOF16: {
-; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>;
-; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<8>;
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
; CHECK-SM80-NOF16-EMPTY:
; CHECK-SM80-NOF16-NEXT: // %bb.0:
; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: setp.ne.s16 %p1, %rs1, 0;
; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1;
-; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
-; CHECK-SM80-NOF16-NEXT: setp.ne.s16 %p3, %rs1, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
-; CHECK-SM80-NOF16-NEXT: setp.ne.s16 %p4, %rs2, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
-; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs7;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p2, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs2, %p2;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs5;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs3, %rs5, %p4;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs6;
; CHECK-SM80-NOF16-NEXT: ret;
%x = call half @llvm.minimum.f16(half %a, half %b)
ret half %x
@@ -677,22 +673,20 @@ define half @minimum_half(half %a, half %b) {
define float @minimum_float(float %a, float %b) {
; CHECK-NOF16-LABEL: minimum_float(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
+; CHECK-NOF16-NEXT: .reg .pred %p<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<7>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_float_param_0];
+; CHECK-NOF16-NEXT: setp.ne.s32 %p1, %r1, 0;
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [minimum_float_param_1];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r2;
-; CHECK-NOF16-NEXT: min.f32 %r3, %r1, %r2;
-; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
-; CHECK-NOF16-NEXT: setp.ne.s32 %p2, %r1, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2;
-; CHECK-NOF16-NEXT: setp.ne.s32 %p3, %r2, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NOF16-NEXT: selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r1, %r2;
+; CHECK-NOF16-NEXT: min.f32 %r4, %r1, %r2;
+; CHECK-NOF16-NEXT: selp.f32 %r5, 0f7FC00000, %r4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r5, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r6, %r3, %r5, %p3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_float(
@@ -728,13 +722,13 @@ define float @minimum_imm1(float %a) {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm1_param_0];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
-; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1;
-; CHECK-NOF16-NEXT: setp.ne.s32 %p2, %r1, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3;
+; CHECK-NOF16-NEXT: setp.ne.s32 %p1, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r2, %r1, 0f00000000, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r1, %r1;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r2, %r4, %p3;
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NOF16-NEXT: ret;
;
@@ -769,13 +763,13 @@ define float @minimum_imm2(float %a) {
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm2_param_0];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
-; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1;
-; CHECK-NOF16-NEXT: setp.ne.s32 %p2, %r1, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3;
+; CHECK-NOF16-NEXT: setp.ne.s32 %p1, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r2, %r1, 0f00000000, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r1, %r1;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r2, %r4, %p3;
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NOF16-NEXT: ret;
;
@@ -805,22 +799,20 @@ define float @minimum_imm2(float %a) {
define float @minimum_float_ftz(float %a, float %b) #1 {
; CHECK-NOF16-LABEL: minimum_float_ftz(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
+; CHECK-NOF16-NEXT: .reg .pred %p<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<7>;
; CHECK-NOF...
[truncated]
|
Isn't it just one of them that must be +/-0.0? We only know that the other value is smaller (max)/bigger (min) than 0.
If we were actually checking the sign this would work, but we compare equality with integer 0.
You are right. I've reverted this for now.
When ordering signed zero, only check the sign of one of the values. We already know at this point that both values must be +/-0.0, so it is sufficient to check one of them to correctly order them.
For example, for fmaximum, if we know LHS is `+0.0` then we can always select LHS; the value of RHS does not matter. If LHS is `-0.0` we can always select RHS; the value of RHS doesn't matter.