-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU][True16] added Pre-RA hint to improve copy elimination #103366
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16] added Pre-RA hint to improve copy elimination #103366
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesPatch is 27.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103366.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4467836cffc56..c7f3fb872c1a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,11 +22,21 @@
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -236,5 +246,38 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (!ST.useRealTrue16Insts())
+ return Changed;
+
+ // Add RA hints to improve True16 COPY elimination.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ee72837a50fc4..4e5fa782345cc 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3327,6 +3327,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+ switch (Hint.first) {
+ case AMDGPURI::Size32: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys =
+ getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+ &AMDGPU::VGPR_32RegClass);
+ }
+
+ // Prefer the paired physreg.
+ if (PairedPhys)
+ // isLo(Paired) is implicitly true here from the API of
+ // getMatchingSuperReg.
+ Hints.push_back(PairedPhys);
+ return false;
+ }
+ case AMDGPURI::Size16: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+ }
+
+ // First prefer the paired physreg.
+ if (PairedPhys)
+ Hints.push_back(PairedPhys);
+ else {
+ // Add all the lo16 physregs.
+ // When the Paired operand has not yet been assigned a physreg it is
+ // better to try putting VirtReg in a lo16 register, because possibly
+ // later Paired can be assigned to the overlapping register and the COPY
+ // can be eliminated.
+ for (MCPhysReg PhysReg : Order) {
+ if (PhysReg == PairedPhys || AMDGPU::isHi(PhysReg, *this))
+ continue;
+ if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+ !MRI.isReserved(PhysReg))
+ Hints.push_back(PhysReg);
+ }
+ }
+ return false;
+ }
+ default:
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
+}
+
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 88d5686720985..622e86d03048a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
@@ -326,6 +333,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 1094b768f1bd1..64329c657b814 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 4faa482ede59f..24412cff02ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -164,12 +164,9 @@ define amdgpu_kernel void @ceil_v2f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l
-; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-NEXT: v_ceil_f16_e32 v1.l, v1.l
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 61f6c9f7f0e6f..c9d05c886d99b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -165,12 +165,9 @@ define amdgpu_kernel void @floor_v2f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
-; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-NEXT: v_floor_f16_e32 v1.l, v1.l
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 72e86f1f6f999..4c3071be3d28f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -217,9 +217,8 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i8:
@@ -307,9 +306,7 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i16:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i16:
@@ -477,14 +474,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -546,13 +540,10 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v4.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -610,12 +601,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -665,13 +653,10 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -740,16 +725,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -820,15 +802,12 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v6.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v4, v5
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -895,12 +874,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v5.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -958,13 +934,10 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v4.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: ...
[truncated]
|
Hi @arsenm @Sisyph This PR is opened from #102620 (comment) |
Is this not already the order in the classes? You can explicitly override the allocation order on the register class definition |
I think it's already implemented in this way. This patch is to address the additional COPY issue when 16bit data being passed between 16 and 32bit instructions |
What if you added CostPerUse to the high halves of the registers? |
Thanks Matt I will try this. I will lower the priortiy of this PR since the upstreaming of the other true16 changes will be more urgent |
a39af9b
to
889b4cb
Compare
I don't think CostPerUse is the right solution, because we want to take use instruction context into account to decide whether a value should go into v.l or v.h. CostPerUse would tell you to always use .l over .h. We want to use .h whenever possible, because that's the way to reduce register pressure, unless the value in the register would later be forced into a VGPR32 (by being used in a 32-bit alu) |
889b4cb
to
0f31a55
Compare
rebased and update tests |
/// This pass also adds register allocation hints to COPY. | ||
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints. | ||
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These three lines are duplicated below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that 16-bit values can live in either VGPR_16 or VGPR_32, this approach seems reasonable to me.
Eventually I think we should aim to clean this up such that 16-bit values always live in VGPR_16.
and 32bit register copy
0f31a55
to
ca4d90b
Compare
rebased and fixed test |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/140/builds/18854 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/1187 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/14979 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/14740 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/14810 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/174/builds/14418 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/108/builds/10337 Here is the relevant piece of the build log for the reference
|
Not sure why this hit the buildbot as I just rebased and update the patch. Hmmmm, I guess I might be unlucky that some other PR just get merged before I merged this one. Working on getting up a fix |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/95/builds/10659 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/204/builds/3255 Here is the relevant piece of the build log for the reference
|
Fix is up here #131028 |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/9658 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/35/builds/8113 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/143/builds/6130 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/198/builds/2781 Here is the relevant piece of the build log for the reference
|
… (#131028) This is a NFC patch llvm/llvm-project#103366 hit a buildbot failure with i1-to-bf16.ll. Update the test to fix the build. Also remove duplicated comments added in llvm/llvm-project#103366
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/76/builds/7710 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/85/builds/6447 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/94/builds/5203 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/164/builds/8038 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/12944 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/153/builds/25556 Here is the relevant piece of the build log for the reference
|
if (Dst.isVirtual() && | ||
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass && | ||
Src.isPhysical() && | ||
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) | ||
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this situation should be a hard machine verifier error
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/21876 Here is the relevant piece of the build log for the reference
|
…103366) The allocation order of 16 bit registers is vgpr0lo16, vgpr0hi16, vgpr1lo16, vgpr1hi16, vgpr2lo16.... We prefer (essentially require) that allocation order, because it uses the minimum number of registers. But when you have 16 bit data passing between 16 and 32 bit instructions you get lots of COPY. This patch teach the compiler that a COPY of a 16-bit value from a 32 bit register to a lo-half 16 bit register is free, to a hi-half 16 bit register is not. This might get improved to coalescing with additional cases, and perhaps as an alternative to the RA hints. For now upstreaming this solution first.
This is a NFC patch llvm#103366 hit a buildbot failure with i1-to-bf16.ll. Update the test to fix the build. Also remove duplicated comments added in llvm#103366
The allocation order of 16 bit registers is vgpr0lo16, vgpr0hi16, vgpr1lo16, vgpr1hi16, vgpr2lo16.... We prefer (essentially require) that allocation order, because it uses the minimum number of registers. But when you have 16 bit data passing between 16 and 32 bit instructions you get lots of COPY.
This patch teach the compiler that a COPY of a 16-bit value from a 32 bit register to a lo-half 16 bit register is free, to a hi-half 16 bit register is not.
This might get improved to coalescing with additional cases, and perhaps as an alternative to the RA hints. For now upstreaming this solution first.