-
Notifications
You must be signed in to change notification settings - Fork 13.6k
AMDGPU: Combine nnan fminimum/fmaximum to fminnum_ieee/fmaxnum_ieee #142217
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This improves codegen for gfx950, where fminimum/fmaximum are legal through fminimum3/fmaximum3, so may have an additional encoding cost.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThis improves codegen for gfx950, where fminimum/fmaximum are Patch is 34.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142217.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c61c52ec5843e..2493b4f1bb0a7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13822,6 +13822,17 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
return Res;
}
+ // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
+ // for some types, but at a higher cost since it's implemented with a 3
+ // operand form.
+ const SDNodeFlags Flags = N->getFlags();
+ if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
+ !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
+ unsigned NewOpc =
+ Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index a56c92785d487..92a2f54841eed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -113,17 +113,11 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan:
; GFX10: ; %bb.0:
@@ -270,17 +264,11 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_f16__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -771,17 +759,11 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v2f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v2f16__nnan:
; GFX10: ; %bb.0:
@@ -939,17 +921,11 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -1296,19 +1272,12 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v3f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v3f16__nnan:
; GFX10: ; %bb.0:
@@ -1501,19 +1470,12 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -1741,19 +1703,12 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v4f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v4f16__nnan:
; GFX10: ; %bb.0:
@@ -1981,19 +1936,12 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT: v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -2788,4 +2736,3 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 826bf427503ab..6c4f13a4eab8f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -85,17 +85,11 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_f32__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f32__nnan:
; GFX10: ; %bb.0:
@@ -199,17 +193,11 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_f32__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f32__nnan_nsz:
; GFX10: ; %bb.0:
@@ -568,19 +556,12 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v2f32__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v2f32__nnan:
; GFX10: ; %bb.0:
@@ -704,19 +685,12 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
; GFX10: ; %bb.0:
@@ -971,21 +945,13 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
; GFX8-NEXT: v_max_f32_e32 v2, v2, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v3f32__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT: v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v3f32__nnan:
; GFX10: ; %bb.0:
@@ -1131,21 +1097,13 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
; GFX8-NEXT: v_max_f32_e32 v2, v2, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT: v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
; GFX10: ; %bb.0:
@@ -1310,23 +1268,14 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v4f32__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT: v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT: v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v4f32__nnan:
; GFX10: ; %bb.0:
@@ -1493,23 +1442,14 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT: v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT: v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT: v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
; GFX10: ; %bb.0:
@@ -2051,4 +1991,3 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 3dcc70b0ea3b6..9e82b41bb9585 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -91,17 +91,11 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_minimum_f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan:
; GFX10: ; %bb.0:
@@ -225,17 +219,11 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_minimum_f16__nnan_nsz:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan_nsz:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -646,17 +634,11 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_minimum_v2f16__nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_min_f16 v0, v0, v1
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f16__...
[truncated]
|
I'm not sure this plays well with the generic DAGCombiner. It prefers fminimum/fmaximum over the others when available; it might be better to just add this as an alternative pattern based on the flag |
The DAGCombiner doesn't directly do any transforms between the different min/maxes, but it should. The preference is only applied when matching the compare and select patterns into one of these |
This improves codegen for gfx950, where fminimum/fmaximum are
legal through fminimum3/fmaximum3, so may have an additional
encoding cost.