@@ -112,9 +112,9 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
112
112
using RegInterval = std::pair<signed , signed >;
113
113
114
114
// Unnamed struct with a single instance, HardwareLimits, that groups five
// "...Max" limit fields. getWaitCountMax() returns HardwareLimits.VmcntMax
// for VM_CNT.
struct {
115
// Removed side of the diff: the three counter maxima as signed 32-bit fields.
- int32_t VmcntMax;
116
- int32_t ExpcntMax;
117
- int32_t LgkmcntMax;
115
// Added side of the diff: the same three fields, now unsigned 32-bit, which
// matches the uint32_t return type getWaitCountMax() gets in this change.
+ uint32_t VmcntMax;
116
+ uint32_t ExpcntMax;
117
+ uint32_t LgkmcntMax;
118
118
// The two register-count limits are untouched by this change and stay signed.
int32_t NumVGPRsMax;
119
119
int32_t NumSGPRsMax;
120
120
} HardwareLimits;
@@ -194,7 +194,7 @@ class BlockWaitcntBrackets {
194
194
195
195
~BlockWaitcntBrackets () = default ;
196
196
197
- static int32_t getWaitCountMax (InstCounterType T) {
197
+ static uint32_t getWaitCountMax (InstCounterType T) {
198
198
switch (T) {
199
199
case VM_CNT:
200
200
return HardwareLimits.VmcntMax ;
@@ -208,33 +208,33 @@ class BlockWaitcntBrackets {
208
208
return 0 ;
209
209
}
210
210
211
// Stores Val into ScoreLBs[T], the lower-bound score for counter type T.
// The only change in this hunk is the type of Val: int32_t -> uint32_t.
- void setScoreLB (InstCounterType T, int32_t Val) {
211
+ void setScoreLB (InstCounterType T, uint32_t Val) {
212
212
assert (T < NUM_INST_CNTS);
213
213
// Explicit range check in addition to the assert: if the assert does not
// fire (e.g. it is compiled out), an out-of-range T makes the call a no-op
// instead of writing past the end of ScoreLBs.
if (T >= NUM_INST_CNTS)
214
214
return ;
215
215
ScoreLBs[T] = Val;
216
216
}
217
217
218
// Stores Val into ScoreUBs[T], the upper-bound score for counter type T.
// For EXP_CNT it additionally raises ScoreLBs[T] to
// ScoreUBs[T] - getWaitCountMax(EXP_CNT) when the current lower bound is
// below that value.
// This hunk changes Val and the local UB from int32_t to uint32_t.
- void setScoreUB (InstCounterType T, int32_t Val) {
218
+ void setScoreUB (InstCounterType T, uint32_t Val) {
219
219
assert (T < NUM_INST_CNTS);
220
220
// Out-of-range T is a no-op when the assert above does not fire.
if (T >= NUM_INST_CNTS)
221
221
return ;
222
222
ScoreUBs[T] = Val;
223
223
if (T == EXP_CNT) {
224
// Removed side: signed arithmetic, where a negative UB simply failed the
// ScoreLBs[T] < UB comparison.
- int32_t UB = ( int )( ScoreUBs[T] - getWaitCountMax (EXP_CNT) );
225
- if (ScoreLBs[T] < UB)
224
// Added side: the subtraction is now unsigned, so when
// getWaitCountMax(EXP_CNT) exceeds ScoreUBs[T] it wraps around to a value
// larger than ScoreUBs[T]. The extra UB < ScoreUBs[T] test rejects that
// wrapped result so the lower bound is left alone in that case.
+ uint32_t UB = ScoreUBs[T] - getWaitCountMax (EXP_CNT);
225
+ if (ScoreLBs[T] < UB && UB < ScoreUBs[T] )
226
226
ScoreLBs[T] = UB;
227
227
}
228
228
}
229
229
230
// Returns ScoreLBs[T], the lower-bound score for counter type T.
// The only change in this hunk is the return type: int32_t -> uint32_t.
- int32_t getScoreLB (InstCounterType T) const {
230
+ uint32_t getScoreLB (InstCounterType T) const {
231
231
assert (T < NUM_INST_CNTS);
232
232
// Out-of-range T yields 0 (when the assert above does not fire) rather than
// reading past the end of ScoreLBs.
if (T >= NUM_INST_CNTS)
233
233
return 0 ;
234
234
return ScoreLBs[T];
235
235
}
236
236
237
- int32_t getScoreUB (InstCounterType T) const {
237
+ uint32_t getScoreUB (InstCounterType T) const {
238
238
assert (T < NUM_INST_CNTS);
239
239
if (T >= NUM_INST_CNTS)
240
240
return 0 ;
@@ -251,7 +251,7 @@ class BlockWaitcntBrackets {
251
251
return EXP_CNT;
252
252
}
253
253
254
- void setRegScore (int GprNo, InstCounterType T, int32_t Val) {
254
+ void setRegScore (int GprNo, InstCounterType T, uint32_t Val) {
255
255
if (GprNo < NUM_ALL_VGPRS) {
256
256
if (GprNo > VgprUB) {
257
257
VgprUB = GprNo;
@@ -266,7 +266,7 @@ class BlockWaitcntBrackets {
266
266
}
267
267
}
268
268
269
- int32_t getRegScore (int GprNo, InstCounterType T) {
269
+ uint32_t getRegScore (int GprNo, InstCounterType T) {
270
270
if (GprNo < NUM_ALL_VGPRS) {
271
271
return VgprScores[T][GprNo];
272
272
}
@@ -291,15 +291,15 @@ class BlockWaitcntBrackets {
291
291
292
292
void setExpScore (const MachineInstr *MI, const SIInstrInfo *TII,
293
293
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
294
- unsigned OpNo, int32_t Val);
294
+ unsigned OpNo, uint32_t Val);
295
295
296
296
int32_t getMaxVGPR () const { return VgprUB; }
297
297
int32_t getMaxSGPR () const { return SgprUB; }
298
298
299
299
bool counterOutOfOrder (InstCounterType T) const ;
300
300
bool simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
301
301
bool simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
302
- void determineWait (InstCounterType T, int ScoreToWait,
302
+ void determineWait (InstCounterType T, uint32_t ScoreToWait,
303
303
AMDGPU::Waitcnt &Wait) const ;
304
304
void applyWaitcnt (const AMDGPU::Waitcnt &Wait);
305
305
void applyWaitcnt (InstCounterType T, unsigned Count);
@@ -342,19 +342,19 @@ class BlockWaitcntBrackets {
342
342
// Data members visible in this hunk. The change converts every score array
// (ScoreLBs, ScoreUBs, LastFlat, VgprScores, SgprScores) from int32_t to
// uint32_t; VgprUB, SgprUB and PostOrder stay int32_t.
const GCNSubtarget *ST = nullptr ;
343
343
bool RevisitLoop = false ;
344
344
int32_t PostOrder = 0 ;
345
// Per-counter-type lower and upper bound scores, read and written through
// setScoreLB/setScoreUB/getScoreLB/getScoreUB.
- int32_t ScoreLBs[NUM_INST_CNTS] = {0 };
346
- int32_t ScoreUBs[NUM_INST_CNTS] = {0 };
345
+ uint32_t ScoreLBs[NUM_INST_CNTS] = {0 };
346
+ uint32_t ScoreUBs[NUM_INST_CNTS] = {0 };
347
347
uint32_t PendingEvents = 0 ;
348
348
bool MixedPendingEvents[NUM_INST_CNTS] = {false };
349
349
// Remember the last flat memory operation.
350
- int32_t LastFlat[NUM_INST_CNTS] = {0 };
350
+ uint32_t LastFlat[NUM_INST_CNTS] = {0 };
351
351
// wait_cnt scores for every vgpr.
352
352
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
353
353
int32_t VgprUB = 0 ;
354
354
int32_t SgprUB = 0 ;
355
// NOTE(review): unlike the other arrays here, VgprScores has no in-class
// initializer in this hunk - confirm it is initialized elsewhere (e.g. in a
// constructor) before any read.
- int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
355
+ uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
356
356
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
357
- int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0 };
357
+ uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0 };
358
358
};
359
359
360
360
// This is a per-loop-region object that records waitcnt status at the end of
@@ -527,7 +527,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
527
527
const SIInstrInfo *TII,
528
528
const SIRegisterInfo *TRI,
529
529
const MachineRegisterInfo *MRI,
530
- unsigned OpNo, int32_t Val) {
530
+ unsigned OpNo, uint32_t Val) {
531
531
RegInterval Interval = getRegInterval (MI, TII, MRI, TRI, OpNo, false );
532
532
LLVM_DEBUG ({
533
533
const MachineOperand &Opnd = MI->getOperand (OpNo);
@@ -544,7 +544,9 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
544
544
WaitEventType E, MachineInstr &Inst) {
545
545
const MachineRegisterInfo &MRIA = *MRI;
546
546
InstCounterType T = eventCounter (E);
547
- int32_t CurrScore = getScoreUB (T) + 1 ;
547
+ uint32_t CurrScore = getScoreUB (T) + 1 ;
548
+ if (CurrScore == 0 )
549
+ report_fatal_error (" InsertWaitcnt score wraparound" );
548
550
// PendingEvents and ScoreUB need to be updated regardless of whether this event
549
551
// changes the score of a register or not.
550
552
// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
@@ -683,8 +685,8 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
683
685
void BlockWaitcntBrackets::print (raw_ostream &OS) {
684
686
OS << ' \n ' ;
685
687
for (auto T : inst_counter_types ()) {
686
- int LB = getScoreLB (T);
687
- int UB = getScoreUB (T);
688
+ uint32_t LB = getScoreLB (T);
689
+ uint32_t UB = getScoreUB (T);
688
690
689
691
switch (T) {
690
692
case VM_CNT:
@@ -704,10 +706,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
704
706
if (LB < UB) {
705
707
// Print vgpr scores.
706
708
for (int J = 0 ; J <= getMaxVGPR (); J++) {
707
- int RegScore = getRegScore (J, T);
709
+ uint32_t RegScore = getRegScore (J, T);
708
710
if (RegScore <= LB)
709
711
continue ;
710
- int RelScore = RegScore - LB - 1 ;
712
+ uint32_t RelScore = RegScore - LB - 1 ;
711
713
if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
712
714
OS << RelScore << " :v" << J << " " ;
713
715
} else {
@@ -717,10 +719,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
717
719
// Also need to print sgpr scores for lgkm_cnt.
718
720
if (T == LGKM_CNT) {
719
721
for (int J = 0 ; J <= getMaxSGPR (); J++) {
720
- int RegScore = getRegScore (J + NUM_ALL_VGPRS, LGKM_CNT);
722
+ uint32_t RegScore = getRegScore (J + NUM_ALL_VGPRS, LGKM_CNT);
721
723
if (RegScore <= LB)
722
724
continue ;
723
- int RelScore = RegScore - LB - 1 ;
725
+ uint32_t RelScore = RegScore - LB - 1 ;
724
726
OS << RelScore << " :s" << J << " " ;
725
727
}
726
728
}
@@ -740,30 +742,22 @@ bool BlockWaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
740
742
741
743
// Per-counter overload. Returns true and leaves Count unchanged when
// Count < UB and UB - Count > LB, where LB/UB are the current lower/upper
// bound scores for T. Otherwise it overwrites Count with ~0u and returns
// false.
bool BlockWaitcntBrackets::simplifyWaitcnt (InstCounterType T,
742
744
unsigned &Count) const {
743
// Removed side: signed bounds, with casts to mix them with the unsigned
// Count.
- const int32_t LB = getScoreLB (T);
744
- const int32_t UB = getScoreUB (T);
745
- if (Count < ( unsigned ) UB && UB - ( int32_t ) Count > LB)
745
// Added side: everything is unsigned, so the casts go away. Count < UB is
// evaluated first (&& short-circuits), which guarantees the unsigned
// subtraction UB - Count cannot wrap when it is evaluated.
+ const uint32_t LB = getScoreLB (T);
746
+ const uint32_t UB = getScoreUB (T);
747
+ if (Count < UB && UB - Count > LB)
746
748
return true ;
747
749
748
750
// Condition not met: replace Count with the all-ones value.
Count = ~0u ;
749
751
return false ;
750
752
}
751
753
752
- void BlockWaitcntBrackets::determineWait (InstCounterType T, int ScoreToWait,
754
+ void BlockWaitcntBrackets::determineWait (InstCounterType T,
755
+ uint32_t ScoreToWait,
753
756
AMDGPU::Waitcnt &Wait) const {
754
- if (ScoreToWait == -1 ) {
755
- // The score to wait is unknown. This implies that it was not encountered
756
- // during the path of the CFG walk done during the current traversal but
757
- // may be seen on a different path. Emit an s_wait counter with a
758
- // conservative value of 0 for the counter.
759
- addWait (Wait, T, 0 );
760
- return ;
761
- }
762
-
763
757
// If the score of src_operand falls within the bracket, we need an
764
758
// s_waitcnt instruction.
765
- const int32_t LB = getScoreLB (T);
766
- const int32_t UB = getScoreUB (T);
759
+ const uint32_t LB = getScoreLB (T);
760
+ const uint32_t UB = getScoreUB (T);
767
761
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
768
762
if ((T == VM_CNT || T == LGKM_CNT) &&
769
763
hasPendingFlat () &&
@@ -790,13 +784,13 @@ void BlockWaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
790
784
}
791
785
792
786
void BlockWaitcntBrackets::applyWaitcnt (InstCounterType T, unsigned Count) {
793
- const int32_t UB = getScoreUB (T);
794
- if (Count >= ( unsigned ) UB)
787
+ const uint32_t UB = getScoreUB (T);
788
+ if (Count >= UB)
795
789
return ;
796
790
if (Count != 0 ) {
797
791
if (counterOutOfOrder (T))
798
792
return ;
799
- setScoreLB (T, std::max (getScoreLB (T), UB - ( int32_t ) Count));
793
+ setScoreLB (T, std::max (getScoreLB (T), UB - Count));
800
794
} else {
801
795
setScoreLB (T, UB);
802
796
MixedPendingEvents[T] = false ;
@@ -1235,8 +1229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(
1235
1229
// this merged score bracket is used when adding waitcnts to the Block
1236
1230
void SIInsertWaitcnts::mergeInputScoreBrackets (MachineBasicBlock &Block) {
1237
1231
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get ();
1238
- int32_t MaxPending[NUM_INST_CNTS] = {0 };
1239
- int32_t MaxFlat[NUM_INST_CNTS] = {0 };
1232
+ uint32_t MaxPending[NUM_INST_CNTS] = {0 };
1233
+ uint32_t MaxFlat[NUM_INST_CNTS] = {0 };
1240
1234
1241
1235
// For single basic block loops, we need to retain the Block's
1242
1236
// score bracket to have accurate Pred info. So, make a copy of Block's
@@ -1264,7 +1258,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1264
1258
if (!Visited)
1265
1259
continue ;
1266
1260
for (auto T : inst_counter_types ()) {
1267
- int span =
1261
+ uint32_t span =
1268
1262
PredScoreBrackets->getScoreUB (T) - PredScoreBrackets->getScoreLB (T);
1269
1263
MaxPending[T] = std::max (MaxPending[T], span);
1270
1264
span =
@@ -1291,27 +1285,27 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1291
1285
1292
1286
// Now merge the gpr_reg_score information
1293
1287
for (auto T : inst_counter_types ()) {
1294
- int PredLB = PredScoreBrackets->getScoreLB (T);
1295
- int PredUB = PredScoreBrackets->getScoreUB (T);
1288
+ uint32_t PredLB = PredScoreBrackets->getScoreLB (T);
1289
+ uint32_t PredUB = PredScoreBrackets->getScoreUB (T);
1296
1290
if (PredLB < PredUB) {
1297
- int PredScale = MaxPending[T] - PredUB;
1291
+ uint32_t PredScale = MaxPending[T] - PredUB;
1298
1292
// Merge vgpr scores.
1299
1293
for (int J = 0 ; J <= PredScoreBrackets->getMaxVGPR (); J++) {
1300
- int PredRegScore = PredScoreBrackets->getRegScore (J, T);
1294
+ uint32_t PredRegScore = PredScoreBrackets->getRegScore (J, T);
1301
1295
if (PredRegScore <= PredLB)
1302
1296
continue ;
1303
- int NewRegScore = PredScale + PredRegScore;
1297
+ uint32_t NewRegScore = PredScale + PredRegScore;
1304
1298
ScoreBrackets->setRegScore (
1305
1299
J, T, std::max (ScoreBrackets->getRegScore (J, T), NewRegScore));
1306
1300
}
1307
1301
// Also need to merge sgpr scores for lgkm_cnt.
1308
1302
if (T == LGKM_CNT) {
1309
1303
for (int J = 0 ; J <= PredScoreBrackets->getMaxSGPR (); J++) {
1310
- int PredRegScore =
1304
+ uint32_t PredRegScore =
1311
1305
PredScoreBrackets->getRegScore (J + NUM_ALL_VGPRS, LGKM_CNT);
1312
1306
if (PredRegScore <= PredLB)
1313
1307
continue ;
1314
- int NewRegScore = PredScale + PredRegScore;
1308
+ uint32_t NewRegScore = PredScale + PredRegScore;
1315
1309
ScoreBrackets->setRegScore (
1316
1310
J + NUM_ALL_VGPRS, LGKM_CNT,
1317
1311
std::max (
0 commit comments