-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[AMDGPU][True16][CodeGen] true16 codegen for valu op #124797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][CodeGen] true16 codegen for valu op #124797
Conversation
e657206
to
49410f8
Compare
49410f8
to
3e2646c
Compare
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: true16 selection for valu ops, enable `real-true16` attribute and update the codegen test. Patch is 67.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124797.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d5d54337306c0..98a06670e3d90 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -815,7 +815,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Fix16BitCopies) {
if (((Size == 16) != (SrcSize == 16))) {
// Non-VGPR Src and Dst will later be expanded back to 32 bits.
- assert(ST.hasTrue16BitInsts());
+ assert(ST.useRealTrue16Insts());
Register &RegToFix = (Size == 32) ? DestReg : SrcReg;
MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
RegToFix = SubReg;
@@ -989,7 +989,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (ST.hasTrue16BitInsts()) {
+ if (ST.useRealTrue16Insts()) {
if (IsSGPRSrc) {
assert(SrcLow);
SrcReg = NewSrcReg;
@@ -5579,9 +5579,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
: AMDGPU::V_FLOOR_F16_fake16_e64;
case AMDGPU::S_TRUNC_F16:
- return AMDGPU::V_TRUNC_F16_fake16_e64;
+ return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
+ : AMDGPU::V_TRUNC_F16_fake16_e64;
case AMDGPU::S_RNDNE_F16:
- return AMDGPU::V_RNDNE_F16_fake16_e64;
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
+ : AMDGPU::V_RNDNE_F16_fake16_e64;
case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
@@ -5589,17 +5591,27 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
- case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
- case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
- case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
- case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+ case AMDGPU::S_ADD_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
+ : AMDGPU::V_ADD_F16_fake16_e64;
+ case AMDGPU::S_SUB_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
+ : AMDGPU::V_SUB_F16_fake16_e64;
+ case AMDGPU::S_MIN_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
+ : AMDGPU::V_MIN_F16_fake16_e64;
+ case AMDGPU::S_MAX_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
+ : AMDGPU::V_MAX_F16_fake16_e64;
case AMDGPU::S_MINIMUM_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
: AMDGPU::V_MINIMUM_F16_fake16_e64;
case AMDGPU::S_MAXIMUM_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
- case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
+ case AMDGPU::S_MUL_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
+ : AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
case AMDGPU::S_FMAC_F16:
@@ -5664,15 +5676,25 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
: AMDGPU::V_CMP_NLT_F16_fake16_e64;
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
- case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
+ case AMDGPU::V_S_EXP_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
+ : AMDGPU::V_EXP_F16_fake16_e64;
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
- case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
+ case AMDGPU::V_S_LOG_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
+ : AMDGPU::V_LOG_F16_fake16_e64;
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
- case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
+ case AMDGPU::V_S_RCP_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
+ : AMDGPU::V_RCP_F16_fake16_e64;
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
- case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
+ case AMDGPU::V_S_RSQ_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
+ : AMDGPU::V_RSQ_F16_fake16_e64;
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
- case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
+ case AMDGPU::V_S_SQRT_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
+ : AMDGPU::V_SQRT_F16_fake16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index d81faf91801b0..235ec22ba5c60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_pow_f32(float %x, float %y) {
; GFX6-LABEL: v_pow_f32:
@@ -371,19 +372,33 @@ define half @v_pow_f16(half %x, half %y) {
; GFX10-NEXT: v_exp_f16_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%pow = call half @llvm.pow.f16(half %x, half %y)
ret half %pow
}
@@ -474,31 +489,54 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
ret <2 x half> %pow
}
@@ -597,33 +635,57 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
ret <2 x half> %pow
@@ -723,32 +785,56 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-FAKE16: ...
[truncated]
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/13848 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/13795 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/260 Here is the relevant piece of the build log for the reference
|
There seems to be a test failure caused by an update in another PR. Getting a fix up right now |
Got a fix up here: #128905, and waiting for CI |
This is an NFC change. Update the test file and fix the build. #124797 is causing a build issue.
… op (#128905) This is an NFC change. Update the test file and fix the build. llvm/llvm-project#124797 is causing a build issue.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/12005 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/24003 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/20534 Here is the relevant piece of the build log for the reference
|
true16 selection for valu ops, enable `real-true16` attribute and update the codegen test