
Commit 14fcd81

[AMDGPU][InsertWaitCnts] Refactor some helper functions, NFC (#161160)
- Remove one-line wrappers around a simple function call when they're only used once or twice.
- Move very generic helpers into SIInstrInfo.
- Delete unused functions.

The goal is simply to reduce the noise in SIInsertWaitCnts without hiding functionality. I focused on moving trivial helpers, or helpers with very descriptive/verbose names (so it doesn't hide too much logic away from the pass) that have some reusability potential.

I'm also trying to make the code style more consistent. It doesn't make sense to see a call to `TII->isXXX` and then, suddenly, a call to some local `isY` method that just wraps `TII->isY`.

The context of this work is that I'm trying to learn how this pass works, and while going through the code I noticed some little things here and there that I thought would be good to fix.
1 parent f80e7e1 commit 14fcd81
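To illustrate the first bullet, here is the wrapper-removal pattern distilled into a minimal sketch. It mirrors the real `hasXcnt()` change in the diff below, but is simplified, not a verbatim excerpt:

// Before: the pass carries a one-line wrapper that only forwards to the
// subtarget, so readers must chase an extra hop to see what it checks.
bool hasXcnt() const { return ST->hasWaitXCnt(); }
// ... at the call site:
if (hasXcnt() && Op.isDef())
  ScoreBrackets.determineWait(X_CNT, Interval, Wait);

// After: query the subtarget directly; same behavior, one less indirection.
if (ST->hasWaitXCnt() && Op.isDef())
  ScoreBrackets.determineWait(X_CNT, Interval, Wait);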

File tree: 3 files changed (+89 −102 lines)

- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
- llvm/lib/Target/AMDGPU/SIInstrInfo.h


llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 13 additions & 102 deletions
@@ -495,13 +495,6 @@ class SIInsertWaitcnts {
   bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
   bool run(MachineFunction &MF);
 
-  bool isForceEmitWaitcnt() const {
-    for (auto T : inst_counter_types())
-      if (ForceEmitWaitcnt[T])
-        return true;
-    return false;
-  }
-
   void setForceEmitWaitcnt() {
     // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
     // For debug builds, get the debug counter info and adjust if need be
@@ -570,10 +563,6 @@ class SIInsertWaitcnts {
     return VmemReadMapping[getVmemType(Inst)];
   }
 
-  bool hasXcnt() const { return ST->hasWaitXCnt(); }
-
-  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
-  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool isVmemAccess(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
@@ -591,7 +580,6 @@ class SIInsertWaitcnts {
                             WaitcntBrackets &ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
-  static bool asynchronouslyWritesSCC(unsigned Opcode);
 };
 
 // This objects maintains the current score brackets of each wait counter, and
@@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       setRegScore(FIRST_LDS_VGPR, T, CurrScore);
   }
 
-  if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) {
+  if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
     setRegScore(SCC, T, CurrScore);
     PendingSCCWrite = &Inst;
   }
@@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
   return Modified;
 }
 
-static bool readsVCCZ(const MachineInstr &MI) {
-  unsigned Opc = MI.getOpcode();
-  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
-         !MI.getOperand(1).isUndef();
-}
-
 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
   // Currently all conventions wait, but this may not always be the case.
@@ -2061,7 +2043,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
       }
 
-      if (hasXcnt() && Op.isDef())
+      if (ST->hasWaitXCnt() && Op.isDef())
        ScoreBrackets.determineWait(X_CNT, Interval, Wait);
     }
   }
@@ -2087,10 +2069,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // TODO: Remove this work-around, enable the assert for Bug 457939
   //       after fixing the scheduler. Also, the Shader Compiler code is
   //       independent of target.
-  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
-    if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
-      Wait.DsCnt = 0;
-    }
+  if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
+      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+    Wait.DsCnt = 0;
   }
 
   // Verify that the wait is actually needed.
@@ -2185,75 +2166,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
   return Modified;
 }
 
-// This is a flat memory operation. Check to see if it has memory tokens other
-// than LDS. Other address spaces supported by flat memory operations involve
-// global memory.
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // All flat instructions use the VMEM counter except prefetch.
-  if (!TII->usesVM_CNT(MI))
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access VMEM.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves VMEM.
-  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
-  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
-  // (GDS) address space is not supported by flat operations. Therefore, simply
-  // return true unless only the LDS address space is found.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    assert(AS != AMDGPUAS::REGION_ADDRESS);
-    if (AS != AMDGPUAS::LOCAL_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either LDS or FLAT.
-bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
-  assert(TII->isFLAT(MI));
-
-  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
-  if (!TII->usesLGKM_CNT(MI))
-    return false;
-
-  // If in tgsplit mode then there can be no use of LDS.
-  if (ST->isTgSplitEnabled())
-    return false;
-
-  // If there are no memory operands then conservatively assume the flat
-  // operation may access LDS.
-  if (MI.memoperands_empty())
-    return true;
-
-  // See if any memory operand specifies an address space that involves LDS.
-  for (const MachineMemOperand *Memop : MI.memoperands()) {
-    unsigned AS = Memop->getAddrSpace();
-    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
-      return true;
-  }
-
-  return false;
-}
-
 bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
-  return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
          (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
 }
 
-static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
-  auto Opc = Inst.getOpcode();
-  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
-         Opc == AMDGPU::GLOBAL_WBINV;
-}
-
 // Return true if the next instruction is S_ENDPGM, following fallthrough
 // blocks if necessary.
 bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
@@ -2331,7 +2248,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
   } else if (TII->isFLAT(Inst)) {
-    if (isGFX12CacheInvOrWBInst(Inst)) {
+    if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
       return;
@@ -2341,14 +2258,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
 
     int FlatASCount = 0;
 
-    if (mayAccessVMEMThroughFlat(Inst)) {
+    if (TII->mayAccessVMEMThroughFlat(Inst)) {
       ++FlatASCount;
       IsVMEMAccess = true;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
                                    Inst);
     }
 
-    if (mayAccessLDSThroughFlat(Inst)) {
+    if (TII->mayAccessLDSThroughFlat(Inst)) {
       ++FlatASCount;
       ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
@@ -2394,7 +2311,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
     else
       ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
-  } else if (asynchronouslyWritesSCC(Inst.getOpcode())) {
+  } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst);
   } else {
     switch (Inst.getOpcode()) {
@@ -2413,7 +2330,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     }
   }
 
-  if (!hasXcnt())
+  if (!ST->hasWaitXCnt())
    return;
 
   if (IsVMEMAccess)
@@ -2516,12 +2433,6 @@ static bool isWaitInstr(MachineInstr &Inst) {
          counterTypeForInstr(Opcode).has_value();
 }
 
-bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) {
-  return Opcode == AMDGPU::S_BARRIER_LEAVE ||
-         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
-         Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
-}
-
 // Generate s_waitcnt instructions where needed.
 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                             MachineBasicBlock &Block,
@@ -2578,7 +2489,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
     OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
-    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+    bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
 
     // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
@@ -2701,7 +2612,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
 
 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
   if (SIInstrInfo::isFLAT(MI))
-    return mayAccessVMEMThroughFlat(MI);
+    return TII->mayAccessVMEMThroughFlat(MI);
   return SIInstrInfo::isVMEM(MI);
 }
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 53 additions & 0 deletions
@@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
   });
 }
 
+bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // All flat instructions use the VMEM counter except prefetch.
+  if (!usesVM_CNT(MI))
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access VMEM.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves VMEM.
+  // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
+  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
+  // (GDS) address space is not supported by flat operations. Therefore, simply
+  // return true unless only the LDS address space is found.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    assert(AS != AMDGPUAS::REGION_ADDRESS);
+    if (AS != AMDGPUAS::LOCAL_ADDRESS)
+      return true;
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+  assert(isFLAT(MI));
+
+  // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
+  if (!usesLGKM_CNT(MI))
+    return false;
+
+  // If in tgsplit mode then there can be no use of LDS.
+  if (ST.isTgSplitEnabled())
+    return false;
+
+  // If there are no memory operands then conservatively assume the flat
+  // operation may access LDS.
+  if (MI.memoperands_empty())
+    return true;
+
+  // See if any memory operand specifies an address space that involves LDS.
+  for (const MachineMemOperand *Memop : MI.memoperands()) {
+    unsigned AS = Memop->getAddrSpace();
+    if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
+      return true;
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
   // Skip the full operand and register alias search modifiesRegister
   // does. There's only a handful of instructions that touch this, it's only an
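With the two flat-access helpers now living on SIInstrInfo, any machine pass holding a `TII` pointer can reuse them. Below is a minimal sketch of the call pattern (the real users are the call sites in SIInsertWaitcnts.cpp above); `classifyFlatAccess` is a hypothetical name introduced only for illustration:

#include "SIInstrInfo.h" // AMDGPU target header, as in llvm/lib/Target/AMDGPU
#include <cassert>
using namespace llvm;

// Sketch: classify which wait counters a FLAT instruction may touch.
// Both helpers assert isFLAT(MI), so callers must establish that first.
static void classifyFlatAccess(const SIInstrInfo *TII, const MachineInstr &MI,
                               bool &MayUseVMEM, bool &MayUseLDS) {
  assert(TII->isFLAT(MI) && "helpers are only valid for FLAT instructions");
  MayUseVMEM = TII->mayAccessVMEMThroughFlat(MI); // VMEM counter side
  MayUseLDS = TII->mayAccessLDSThroughFlat(MI);   // LGKM/DS counter side
}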

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 23 additions & 0 deletions
@@ -688,6 +688,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// to not hit scratch.
   bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
 
+  /// \returns true for FLAT instructions that can access VMEM.
+  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
+
+  /// \returns true for FLAT instructions that can access LDS.
+  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+
   static bool isBlockLoadStore(uint16_t Opcode) {
     switch (Opcode) {
     case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
@@ -748,6 +754,18 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
   }
 
+  static bool isSBarrierSCCWrite(unsigned Opcode) {
+    return Opcode == AMDGPU::S_BARRIER_LEAVE ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0;
+  }
+
+  static bool isCBranchVCCZRead(const MachineInstr &MI) {
+    unsigned Opc = MI.getOpcode();
+    return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+           !MI.getOperand(1).isUndef();
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }
@@ -1010,6 +1028,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
            Opcode == AMDGPU::DS_GWS_BARRIER;
   }
 
+  static bool isGFX12CacheInvOrWBInst(unsigned Opc) {
+    return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
+           Opc == AMDGPU::GLOBAL_WBINV;
+  }
+
   static bool isF16PseudoScalarTrans(unsigned Opcode) {
     return Opcode == AMDGPU::V_S_EXP_F16_e64 ||
            Opcode == AMDGPU::V_S_LOG_F16_e64 ||
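One design note the header makes visible: predicates that only inspect an opcode or its operands (`isSBarrierSCCWrite`, `isCBranchVCCZRead`, `isGFX12CacheInvOrWBInst`) are `static`, while the flat-access helpers stay non-static because they also consult the subtarget (`ST.isTgSplitEnabled()`). Opcode-level checks therefore need no SIInstrInfo instance, as in this sketch; `needsSpecialScoreboardHandling` is a hypothetical name, not part of the commit:

#include "SIInstrInfo.h"
using namespace llvm;

// Sketch: an opcode-level filter built only from the new static predicates;
// no SIInstrInfo instance is required for any of these checks.
static bool needsSpecialScoreboardHandling(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return SIInstrInfo::isGFX12CacheInvOrWBInst(Opc) ||
         SIInstrInfo::isSBarrierSCCWrite(Opc) ||
         // Takes the MachineInstr, not the opcode: it also checks operand 1.
         SIInstrInfo::isCBranchVCCZRead(MI);
}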
