Skip to content

Commit b8c2867

Browse files
committed
Reapply "[AMDGPU] Support block load/store for CSR" (llvm#136846)
This reverts commit 6bb2f90.
1 parent e35cc2d commit b8c2867

19 files changed

+1065
-41
lines changed

llvm/include/llvm/CodeGen/MachineFrameInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class CalleeSavedInfo {
6161
MCRegister getReg() const { return Reg; }
6262
int getFrameIdx() const { return FrameIdx; }
6363
MCRegister getDstReg() const { return DstReg; }
64+
void setReg(MCRegister R) { Reg = R; }
6465
void setFrameIdx(int FI) {
6566
FrameIdx = FI;
6667
SpilledToReg = false;

llvm/include/llvm/CodeGen/TargetFrameLowering.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,14 @@ class TargetFrameLowering {
270270
return false;
271271
}
272272

273+
/// spillCalleeSavedRegister - Default implementation for spilling a single
274+
/// callee saved register.
275+
void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock,
276+
MachineBasicBlock::iterator MI,
277+
const CalleeSavedInfo &CS,
278+
const TargetInstrInfo *TII,
279+
const TargetRegisterInfo *TRI) const;
280+
273281
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
274282
/// saved registers and returns true if it isn't possible / profitable to do
275283
/// so by issuing a series of load instructions via loadRegToStackSlot().
@@ -284,6 +292,15 @@ class TargetFrameLowering {
284292
return false;
285293
}
286294

295+
/// restoreCalleeSavedRegister - Default implementation for restoring a single
296+
/// callee saved register. Should be called in reverse order. Can insert
297+
/// multiple instructions.
298+
void restoreCalleeSavedRegister(MachineBasicBlock &MBB,
299+
MachineBasicBlock::iterator MI,
300+
const CalleeSavedInfo &CS,
301+
const TargetInstrInfo *TII,
302+
const TargetRegisterInfo *TRI) const;
303+
287304
/// hasFP - Return true if the specified function should have a dedicated
288305
/// frame pointer register. For most targets this is true only if the function
289306
/// has variable sized allocas or if frame pointer elimination is disabled.

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 6 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
476476
// Now that we know which registers need to be saved and restored, allocate
477477
// stack slots for them.
478478
for (auto &CS : CSI) {
479-
// If the target has spilled this register to another register, we don't
480-
// need to allocate a stack slot.
479+
// If the target has spilled this register to another register or already
480+
// handled it, we don't need to allocate a stack slot.
481481
if (CS.isSpilledToReg())
482482
continue;
483483

@@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) {
597597
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
598598
ArrayRef<CalleeSavedInfo> CSI) {
599599
MachineFunction &MF = *SaveBlock.getParent();
600-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
600+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
601601
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
602602
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
603603

604604
MachineBasicBlock::iterator I = SaveBlock.begin();
605605
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
606606
for (const CalleeSavedInfo &CS : CSI) {
607-
// Insert the spill to the stack frame.
608-
MCRegister Reg = CS.getReg();
609-
610-
if (CS.isSpilledToReg()) {
611-
BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY),
612-
CS.getDstReg())
613-
.addReg(Reg, getKillRegState(true));
614-
} else {
615-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
616-
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
617-
TRI, Register());
618-
}
607+
TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI);
619608
}
620609
}
621610
}
@@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
624613
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
625614
std::vector<CalleeSavedInfo> &CSI) {
626615
MachineFunction &MF = *RestoreBlock.getParent();
627-
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
616+
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
628617
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
629618
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
630619

@@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
634623

635624
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
636625
for (const CalleeSavedInfo &CI : reverse(CSI)) {
637-
MCRegister Reg = CI.getReg();
638-
if (CI.isSpilledToReg()) {
639-
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
640-
.addReg(CI.getDstReg(), getKillRegState(true));
641-
} else {
642-
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
643-
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
644-
TRI, Register());
645-
assert(I != RestoreBlock.begin() &&
646-
"loadRegFromStackSlot didn't insert any code!");
647-
// Insert in reverse order. loadRegFromStackSlot can insert
648-
// multiple instructions.
649-
}
626+
TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI);
650627
}
651628
}
652629
}

llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "llvm/CodeGen/MachineFunction.h"
1616
#include "llvm/CodeGen/MachineRegisterInfo.h"
1717
#include "llvm/CodeGen/TargetFrameLowering.h"
18+
#include "llvm/CodeGen/TargetInstrInfo.h"
1819
#include "llvm/CodeGen/TargetSubtargetInfo.h"
1920
#include "llvm/IR/Attributes.h"
2021
#include "llvm/IR/Function.h"
@@ -182,3 +183,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
182183
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
183184
return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}};
184185
}
186+
187+
/// Default implementation for saving a single callee saved register.
///
/// If \p CS is marked as spilled to a register, a COPY from the callee saved
/// register into its destination register is inserted before \p MI. Otherwise
/// the register is stored to the frame index recorded in \p CS through
/// TargetInstrInfo::storeRegToStackSlot.
void TargetFrameLowering::spillCalleeSavedRegister(
    MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister SavedReg = CS.getReg();

  // Common case: insert the spill to the stack frame.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *RegClass =
        TRI->getMinimalPhysRegClass(SavedReg);
    TII->storeRegToStackSlot(SaveBlock, MI, SavedReg, true, CS.getFrameIdx(),
                             RegClass, TRI, Register());
    return;
  }

  // The target chose a register as the save location: copy into it and mark
  // the source operand as killed.
  BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY),
          CS.getDstReg())
      .addReg(SavedReg, getKillRegState(true));
}
204+
205+
/// Default implementation for restoring a single callee saved register.
///
/// If \p CS is marked as spilled to a register, a COPY from that register
/// back into the callee saved register is inserted before \p MI. Otherwise
/// the register is reloaded from the frame index recorded in \p CS through
/// TargetInstrInfo::loadRegFromStackSlot.
void TargetFrameLowering::restoreCalleeSavedRegister(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI) const {
  const MCRegister RestoredReg = CS.getReg();

  // Common case: reload the register from its stack slot.
  if (!CS.isSpilledToReg()) {
    const TargetRegisterClass *RegClass =
        TRI->getMinimalPhysRegClass(RestoredReg);
    TII->loadRegFromStackSlot(MBB, MI, RestoredReg, CS.getFrameIdx(), RegClass,
                              TRI, Register());
    // The reload must have placed at least one instruction ahead of MI.
    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
    return;
  }

  // The value was saved in another register: copy it back and mark the
  // source operand as killed.
  BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), RestoredReg)
      .addReg(CS.getDstReg(), getKillRegState(true));
}

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32
12751275
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
12761276
>;
12771277

1278+
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
1279+
// restoring the callee-saved registers.
1280+
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
1281+
"UseBlockVGPROpsForCSR",
1282+
"true",
1283+
"Use block load/store for VGPR callee saved registers"
1284+
>;
1285+
12781286
def FeatureLshlAddU64Inst
12791287
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
12801288
"Has v_lshl_add_u64 instruction">;

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2020
#include "MCTargetDesc/AMDGPUMCExpr.h"
2121
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "SIMachineFunctionInfo.h"
2223
#include "llvm/CodeGen/MachineBasicBlock.h"
2324
#include "llvm/CodeGen/MachineInstr.h"
2425
#include "llvm/IR/Constants.h"
@@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
243244
return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
244245
}
245246

247+
static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
248+
const TargetRegisterInfo *TRI,
249+
const SIMachineFunctionInfo *MFI,
250+
MCStreamer &OS) {
251+
// The instruction will only transfer a subset of the registers in the block,
252+
// based on the mask that is stored in m0. We could search for the instruction
253+
// that sets m0, but most of the time we'll already have the mask stored in
254+
// the machine function info. Try to use that. This assumes that we only use
255+
// block loads/stores for CSR spills.
256+
Register RegBlock =
257+
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
258+
: AMDGPU::OpName::vdata)
259+
->getReg();
260+
Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0);
261+
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
262+
263+
if (!Mask)
264+
return; // Nothing to report
265+
266+
SmallString<512> TransferredRegs;
267+
for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) {
268+
if (Mask & (1 << I)) {
269+
(llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I))
270+
.toVector(TransferredRegs);
271+
}
272+
}
273+
274+
OS.emitRawComment(" transferring at most " + TransferredRegs);
275+
}
276+
246277
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
247278
// FIXME: Enable feature predicate checks once all the test pass.
248279
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -331,6 +362,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
331362
return;
332363
}
333364

365+
if (isVerbose())
366+
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
367+
emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
368+
MF->getInfo<SIMachineFunctionInfo>(),
369+
*OutStreamer);
370+
334371
MCInst TmpInst;
335372
MCInstLowering.lower(MI, TmpInst);
336373
EmitToStreamer(*OutStreamer, TmpInst);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
262262
bool HasPointSampleAccel = false;
263263

264264
bool RequiresCOV6 = false;
265+
bool UseBlockVGPROpsForCSR = false;
265266

266267
// Dummy feature to use for assembler in tablegen.
267268
bool FeatureDisable = false;
@@ -1277,6 +1278,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12771278

12781279
bool requiresCodeObjectV6() const { return RequiresCOV6; }
12791280

1281+
/// \returns the UseBlockVGPROpsForCSR flag, which the "block-vgpr-csr"
/// subtarget feature sets to request block load/store for VGPR callee saved
/// registers.
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1282+
12801283
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12811284

12821285
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }

0 commit comments

Comments
 (0)