[AMDGPU][True16][CodeGen] 16bit spill support in true16 mode #128060
Conversation
Force-pushed from a665051 to 10f29f4
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)

Changes

Enables 16-bit values to be spilled to scratch.

Note: the memory instructions used are defined as reading and writing VGPR_32, but they do not clobber the unspecified 16 bits of those registers, so spills and reloads of the lo and hi halves of the registers work.

Patch is 25.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128060.diff

6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2cf6de73fa90c..7ecb089373692 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1580,6 +1580,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_SAVE;
case 4:
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
@@ -1807,6 +1809,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_RESTORE;
case 4:
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..4ec13807dc4d8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1002,6 +1002,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
} // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}
+defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>;
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 924aa45559366..f3e1f183e8836 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1280,6 +1280,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
+ case AMDGPU::SI_SPILL_V16_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -2347,6 +2349,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A384_SAVE:
@@ -2387,8 +2390,14 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
+ Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ }
+
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
@@ -2406,6 +2415,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
+ case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
@@ -2455,8 +2465,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
+ Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ }
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..6a92e54b69edc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2483,6 +2483,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_16RegClassID:
+ case AMDGPU::VGPR_16_Lo128RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
new file mode 100644
index 0000000000000..0e45df223465d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+
+define void @spill_i16_alu() {
+; GCN-TRUE16-LABEL: spill_i16_alu:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_alu_two_vals() {
+; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v1, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+ %alloca2 = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+ %badd = add i16 %b, 123
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+ %outptr2 = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ store volatile i16 %badd, ptr addrspace(5) %outptr2
+
+ ret void
+}
+
+; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
+
+define void @spill_i16() {
+; GCN-TRUE16-LABEL: spill_i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_half() {
+; GCN-TRUE16-LABEL: spill_half:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_half:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca half, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile half, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ store volatile half %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+ %boutptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %b, ptr addrspace(5) %boutptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16_one_free_reg() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc...
[truncated]
CI failure is not related.
Force-pushed from 10f29f4 to 38a4f38
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
So the mubuf ABI does not work with true16? Or is this todo? Can you assert ST.enableFlatScratch at least?
I am not sure about this part. Might need some input from @Sisyph when he is back from vacation.
Added an assert first
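
A sketch of what such a guard could look like in the store path of SIRegisterInfo::eliminateFrameIndex (hypothetical; the exact assert added in the updated commit is not shown in this excerpt, so the message and placement below are assumptions):

```cpp
// Hypothetical sketch: the true16 16-bit spill is only lowered to a
// flat-scratch opcode here, so guard against the MUBUF (buffer) path.
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
  assert(ST.enableFlatScratch() &&
         "16-bit spills in true16 mode require flat scratch");
  Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
} else {
  Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                               : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
}
```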
I don't think there is a conceptual issue here, just we did not implement T16 BUF instructions or spilling using them yet. We have a ticket to track it. Do you have any idea of the priority of the BUF spilling? It looks like all the latest subtargets have ArchitectedFlatScratch and so use the scratch path.
I don't remember all the details, but I thought gfx11 still defaulted to mubuf path? I vaguely remember a few rounds of the addressing modes not actually working and/or being as good as mubuf with scratch
Force-pushed from 26b979b to 2adee2f
Fixed a typo and rebased.
Force-pushed from 2adee2f to 8d19d36
Rebased again.
Enables 16-bit values to be spilled to scratch.
Note: the memory instructions used are defined as reading and writing VGPR_32, but they do not clobber the unspecified 16 bits of those registers, so spills and reloads of the lo and hi halves of the registers work.
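
For reference, a minimal IR reproduction of the behavior this patch enables (a sketch derived from the new llvm/test/CodeGen/AMDGPU/spillv16.ll test; the llc flags mirror its RUN line):

```llvm
; Run with:
;   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 \
;       -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8
define void @spill_i16_alu() {
entry:
  %alloca = alloca i16, i32 1, align 4, addrspace(5)
  %a = load volatile i16, ptr addrspace(5) %alloca
  %add = add i16 %a, 123
  ; Clobbering v0-v7 under -stress-regalloc=8 forces %add to spill; in true16
  ; mode this becomes a 2-byte scratch_store_b16 / scratch_load_d16_b16 pair.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
  store volatile i16 %add, ptr addrspace(5) %alloca
  ret void
}
```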