-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Handle other fmin flavors in fract combine #141987
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Handle other fmin flavors in fract combine #141987
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Since the input is either known not-nan, or we have explicit use code checking if the input is a nan, any of the 3 is valid to match. Patch is 42.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141987.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 52177a2523bcb..a3f668e6d65ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -327,7 +327,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
- bool visitMinNum(IntrinsicInst &I);
+ bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
};
@@ -2197,7 +2197,9 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
case Intrinsic::bitreverse:
return visitBitreverseIntrinsicInst(I);
case Intrinsic::minnum:
- return visitMinNum(I);
+ case Intrinsic::minimumnum:
+ case Intrinsic::minimum:
+ return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
default:
@@ -2216,7 +2218,9 @@ bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
}
/// Match non-nan fract pattern.
-/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)
+/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
+/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
+/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// If fract is a useful instruction for the subtarget. Does not account for the
/// nan handling; the instruction has a nan check on the input value.
@@ -2224,7 +2228,12 @@ Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
if (ST.hasFractBug())
return nullptr;
- if (I.getIntrinsicID() != Intrinsic::minnum)
+ Intrinsic::ID IID = I.getIntrinsicID();
+
+ // The value is only used in contexts where we know the input isn't a nan, so
+ // any of the fmin variants are fine.
+ if (IID != Intrinsic::minnum &&
+ IID != Intrinsic::minimumnum && IID != Intrinsic::minimum)
return nullptr;
Type *Ty = I.getType();
@@ -2270,7 +2279,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
return insertValues(Builder, FractArg->getType(), ResultVals);
}
-bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
+bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *FractArg = matchFractPat(I);
if (!FractArg)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 9d98a8dab0501..4ee48716439bd 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -2996,19 +2996,30 @@ entry:
}
define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly captures(none) %ip) {
-; IR-LABEL: define float @safe_math_fract_f32_minimum(
-; IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
-; IR-NEXT: [[ENTRY:.*:]]
-; IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
-; IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
-; IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
-; IR-NEXT: [[UNO:%.*]] = fcmp uno float [[X]], 0.000000e+00
-; IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[X]], float [[MIN]]
-; IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
-; IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
-; IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
-; IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
-; IR-NEXT: ret float [[COND6]]
+; GFX6-IR-LABEL: define float @safe_math_fract_f32_minimum(
+; GFX6-IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; GFX6-IR-NEXT: [[ENTRY:.*:]]
+; GFX6-IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; GFX6-IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
+; GFX6-IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
+; GFX6-IR-NEXT: [[UNO:%.*]] = fcmp uno float [[X]], 0.000000e+00
+; GFX6-IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[X]], float [[MIN]]
+; GFX6-IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; GFX6-IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; GFX6-IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; GFX6-IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; GFX6-IR-NEXT: ret float [[COND6]]
+;
+; IR-FRACT-LABEL: define float @safe_math_fract_f32_minimum(
+; IR-FRACT-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; IR-FRACT-NEXT: [[ENTRY:.*:]]
+; IR-FRACT-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; IR-FRACT-NEXT: [[COND:%.*]] = call float @llvm.amdgcn.fract.f32(float [[X]])
+; IR-FRACT-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; IR-FRACT-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; IR-FRACT-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; IR-FRACT-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; IR-FRACT-NEXT: ret float [[COND6]]
;
; GFX6-LABEL: safe_math_fract_f32_minimum:
; GFX6: ; %bb.0: ; %entry
@@ -3035,20 +3046,14 @@ define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly c
; GFX7-LABEL: safe_math_fract_f32_minimum:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_floor_f32_e32 v3, v0
-; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
-; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX7-NEXT: v_fract_f32_e32 v4, v0
; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_floor_f32_e32 v3, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GFX7-NEXT: buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3057,16 +3062,10 @@ define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly c
; GFX8-LABEL: safe_math_fract_f32_minimum:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_floor_f32_e32 v3, v0
-; GFX8-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX8-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc00000
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX8-NEXT: v_fract_f32_e32 v4, v0
; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4
+; GFX8-NEXT: v_floor_f32_e32 v3, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GFX8-NEXT: global_store_dword v[1:2], v3, off
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3075,18 +3074,12 @@ define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly c
; GFX11-LABEL: safe_math_fract_f32_minimum:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_floor_f32_e32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX11-NEXT: global_store_b32 v[1:2], v3, off
-; GFX11-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v5, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX11-NEXT: v_fract_f32_e32 v3, v0
; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT: v_floor_f32_e32 v4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: safe_math_fract_f32_minimum:
@@ -3096,17 +3089,12 @@ define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly c
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_floor_f32_e32 v3, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX12-NEXT: global_store_b32 v[1:2], v3, off
-; GFX12-NEXT: v_minimum_f32 v4, 0x3f7fffff, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX12-NEXT: v_fract_f32_e32 v3, v0
; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT: v_floor_f32_e32 v4, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT: global_store_b32 v[1:2], v4, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
@@ -3122,19 +3110,30 @@ entry:
}
define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeonly captures(none) %ip) {
-; IR-LABEL: define float @safe_math_fract_f32_minimum_swap(
-; IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
-; IR-NEXT: [[ENTRY:.*:]]
-; IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
-; IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
-; IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
-; IR-NEXT: [[UNO:%.*]] = fcmp ord float [[X]], 0.000000e+00
-; IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[MIN]], float [[X]]
-; IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
-; IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
-; IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
-; IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
-; IR-NEXT: ret float [[COND6]]
+; GFX6-IR-LABEL: define float @safe_math_fract_f32_minimum_swap(
+; GFX6-IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; GFX6-IR-NEXT: [[ENTRY:.*:]]
+; GFX6-IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; GFX6-IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
+; GFX6-IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
+; GFX6-IR-NEXT: [[UNO:%.*]] = fcmp ord float [[X]], 0.000000e+00
+; GFX6-IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[MIN]], float [[X]]
+; GFX6-IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; GFX6-IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; GFX6-IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; GFX6-IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; GFX6-IR-NEXT: ret float [[COND6]]
+;
+; IR-FRACT-LABEL: define float @safe_math_fract_f32_minimum_swap(
+; IR-FRACT-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; IR-FRACT-NEXT: [[ENTRY:.*:]]
+; IR-FRACT-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; IR-FRACT-NEXT: [[COND:%.*]] = call float @llvm.amdgcn.fract.f32(float [[X]])
+; IR-FRACT-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; IR-FRACT-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; IR-FRACT-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; IR-FRACT-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; IR-FRACT-NEXT: ret float [[COND6]]
;
; GFX6-LABEL: safe_math_fract_f32_minimum_swap:
; GFX6: ; %bb.0: ; %entry
@@ -3161,20 +3160,14 @@ define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeo
; GFX7-LABEL: safe_math_fract_f32_minimum_swap:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_floor_f32_e32 v3, v0
-; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, 0x7fc00000
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GFX7-NEXT: v_fract_f32_e32 v4, v0
; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_floor_f32_e32 v3, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GFX7-NEXT: buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3183,16 +3176,10 @@ define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeo
; GFX8-LABEL: safe_math_fract_f32_minimum_swap:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_floor_f32_e32 v3, v0
-; GFX8-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX8-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, 0x7fc00000
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GFX8-NEXT: v_fract_f32_e32 v4, v0
; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4
+; GFX8-NEXT: v_floor_f32_e32 v3, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
; GFX8-NEXT: global_store_dword v[1:2], v3, off
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3201,18 +3188,12 @@ define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeo
; GFX11-LABEL: safe_math_fract_f32_minimum_swap:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_floor_f32_e32 v3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX11-NEXT: global_store_b32 v[1:2], v3, off
-; GFX11-NEXT: v_min_f32_e32 v5, 0x3f7fffff, v4
-; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v5, vcc_lo
-; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo
+; GFX11-NEXT: v_fract_f32_e32 v3, v0
; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT: v_floor_f32_e32 v4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX11-NEXT: global_store_b32 v[1:2], v4, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: safe_math_fract_f32_minimum_swap:
@@ -3222,17 +3203,12 @@ define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_floor_f32_e32 v3, v0
-; GFX12-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX12-NEXT: global_store_b32 v[1:2], v3, off
-; GFX12-NEXT: v_minimum_f32 v4, 0x3f7fffff, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo
+; GFX12-NEXT: v_fract_f32_e32 v3, v0
; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX12-NEXT: v_floor_f32_e32 v4, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-NEXT: global_store_b32 v[1:2], v4, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call float @llvm.floor.f32(float %x)
@@ -3248,19 +3224,30 @@ entry:
}
define float @safe_math_fract_f32_minimumnum(float %x, ptr addrspace(1) writeonly captures(none) %ip) {
-; IR-LABEL: define float @safe_math_fract_f32_minimumnum(
-; IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
-; IR-NEXT: [[ENTRY:.*:]]
-; IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
-; IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
-; IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimumnum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
-; IR-NEXT: [[UNO:%.*]] = fcmp uno float [[X]], 0.000000e+00
-; IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[X]], float [[MIN]]
-; IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
-; IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
-; IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
-; IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
-; IR-NEXT: ret float [[COND6]]
+; GFX6-IR-LABEL: define float @safe_math_fract_f32_minimumnum(
+; GFX6-IR-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; GFX6-IR-NEXT: [[ENTRY:.*:]]
+; GFX6-IR-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; GFX6-IR-NEXT: [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
+; GFX6-IR-NEXT: [[MIN:%.*]] = tail call float @llvm.minimumnum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
+; GFX6-IR-NEXT: [[UNO:%.*]] = fcmp uno float [[X]], 0.000000e+00
+; GFX6-IR-NEXT: [[COND:%.*]] = select i1 [[UNO]], float [[X]], float [[MIN]]
+; GFX6-IR-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; GFX6-IR-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; GFX6-IR-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; GFX6-IR-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; GFX6-IR-NEXT: ret float [[COND6]]
+;
+; IR-FRACT-LABEL: define float @safe_math_fract_f32_minimumnum(
+; IR-FRACT-SAME: float [[X:%.*]], ptr addrspace(1) writeonly captures(none) [[IP:%.*]]) #[[ATTR0]] {
+; IR-FRACT-NEXT: [[ENTRY:.*:]]
+; IR-FRACT-NEXT: [[FLOOR:%.*]] = tail call float @llvm.floor.f32(float [[X]])
+; IR-FRACT-NEXT: [[COND:%.*]] = call float @llvm.amdgcn.fract.f32(float [[X]])
+; IR-FRACT-NEXT: [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
+; IR-FRACT-NEXT: [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
+; IR-FRACT-NEXT: [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; IR-FRACT-NEXT: store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
+; IR-FRACT-NEXT: ret float [[COND6]]
;
; GFX6-LABEL: safe_math_fract_f32_minimumnum:
; GFX6: ; %bb.0: ; %entry
@@ -3284,17 +3271,14 @@ define float @safe_math_fract_f32_minimumnum(float %x, ptr addrspace(1) writeonl
; GFX7-LABEL: safe_math_fract_f32_minimumnum:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_floor_f32_e32 v3, v0
-; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fffff, v4
-; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX7-NEXT: v_fract_f32_e32 v4, v0
; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NE...
[truncated]
|
Merge activity
|
966fa32
to
c057908
Compare
c78c3c6
to
e65b6a4
Compare
Since the input is either known not-nan, or we have explicit use code checking if the input is a nan, any of the 3 is valid to match.
e65b6a4
to
50cf4b6
Compare
Since the input is either known not-nan, or we have explicit use code checking if the input is a nan, any of the 3 is valid to match.
Since the input is either known not-nan, or we have explicit use code checking if the input is a nan, any of the 3 is valid to match.
Since the input is either known not-nan, or we have explicit use code checking if the input is a nan, any of the 3 is valid to match.
Since the input is either known not-nan, or we have explicit use
code checking if the input is a nan, any of the 3 is valid to match.