Skip to content

Commit 8d0c34f

Browse files
committed
[AMDGPU] Omit unnecessary waitcnt before barriers
It is not necessary to wait for all outstanding memory operations before a barrier on hardware that can back off of the barrier if an exception occurs while traps are enabled. Add a new subtarget feature that tracks which hardware has this ability. Reviewed By: #amdgpu, rampitec. Differential Revision: https://reviews.llvm.org/D120544
1 parent 20c4664 commit 8d0c34f

File tree

6 files changed

+134
-17
lines changed

6 files changed

+134
-17
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,12 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
737737
"Hardware automatically inserts waitcnt before barrier"
738738
>;
739739

740+
// Subtarget feature "back-off-barrier". When it is set, SIInsertWaitcnts does
// not force an all-zero S_WAITCNT in front of S_BARRIER.
def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
741+
"BackOffBarrier",
742+
"true",
743+
"Hardware supports backing off s_barrier if an exception occurs"
744+
>;
745+
740746
def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
741747
"HasTrigReducedRange",
742748
"true",
@@ -1025,7 +1031,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
10251031
FeatureMadMacF32Insts,
10261032
FeatureSupportsSRAMECC,
10271033
FeaturePackedTID,
1028-
FullRate64Ops]>;
1034+
FullRate64Ops,
1035+
FeatureBackOffBarrier]>;
10291036

10301037
def FeatureISAVersion9_0_C : FeatureSet<
10311038
[FeatureGFX9,
@@ -1059,7 +1066,8 @@ def FeatureISAVersion9_4_0 : FeatureSet<
10591066
FeatureSupportsSRAMECC,
10601067
FeaturePackedTID,
10611068
FeatureArchitectedFlatScratch,
1062-
FullRate64Ops]>;
1069+
FullRate64Ops,
1070+
FeatureBackOffBarrier]>;
10631071

10641072
// TODO: Organize more features into groups.
10651073
def FeatureGroup {
@@ -1094,7 +1102,8 @@ def FeatureISAVersion10_1_0 : FeatureSet<
10941102
FeatureMadMacF32Insts,
10951103
FeatureDsSrc2Insts,
10961104
FeatureLdsMisalignedBug,
1097-
FeatureSupportsXNACK])>;
1105+
FeatureSupportsXNACK,
1106+
FeatureBackOffBarrier])>;
10981107

10991108
def FeatureISAVersion10_1_1 : FeatureSet<
11001109
!listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1116,7 +1125,8 @@ def FeatureISAVersion10_1_1 : FeatureSet<
11161125
FeatureMadMacF32Insts,
11171126
FeatureDsSrc2Insts,
11181127
FeatureLdsMisalignedBug,
1119-
FeatureSupportsXNACK])>;
1128+
FeatureSupportsXNACK,
1129+
FeatureBackOffBarrier])>;
11201130

11211131
def FeatureISAVersion10_1_2 : FeatureSet<
11221132
!listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1138,7 +1148,8 @@ def FeatureISAVersion10_1_2 : FeatureSet<
11381148
FeatureMadMacF32Insts,
11391149
FeatureDsSrc2Insts,
11401150
FeatureLdsMisalignedBug,
1141-
FeatureSupportsXNACK])>;
1151+
FeatureSupportsXNACK,
1152+
FeatureBackOffBarrier])>;
11421153

11431154
def FeatureISAVersion10_1_3 : FeatureSet<
11441155
!listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1156,7 +1167,8 @@ def FeatureISAVersion10_1_3 : FeatureSet<
11561167
FeatureMadMacF32Insts,
11571168
FeatureDsSrc2Insts,
11581169
FeatureLdsMisalignedBug,
1159-
FeatureSupportsXNACK])>;
1170+
FeatureSupportsXNACK,
1171+
FeatureBackOffBarrier])>;
11601172

11611173
def FeatureISAVersion10_3_0 : FeatureSet<
11621174
[FeatureGFX10,
@@ -1173,7 +1185,8 @@ def FeatureISAVersion10_3_0 : FeatureSet<
11731185
FeatureNSAEncoding,
11741186
FeatureNSAMaxSize13,
11751187
FeatureWavefrontSize32,
1176-
FeatureShaderCyclesRegister]>;
1188+
FeatureShaderCyclesRegister,
1189+
FeatureBackOffBarrier]>;
11771190

11781191
//===----------------------------------------------------------------------===//
11791192

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
7272
// Dynamically set bits that enable features.
7373
bool FlatForGlobal;
7474
bool AutoWaitcntBeforeBarrier;
75+
bool BackOffBarrier;
7576
bool UnalignedScratchAccess;
7677
bool UnalignedAccessMode;
7778
bool HasApertureRegs;
@@ -493,6 +494,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
493494
return AutoWaitcntBeforeBarrier;
494495
}
495496

497+
/// \returns true if the target supports backing off of s_barrier instructions
498+
/// when an exception is raised (the "back-off-barrier" subtarget feature).
499+
bool supportsBackOffBarrier() const {
500+
return BackOffBarrier;
501+
}
502+
496503
bool hasUnalignedBufferAccess() const {
497504
return UnalignedBufferAccess;
498505
}

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,12 +1135,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11351135
}
11361136
}
11371137

1138-
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1139-
// occurs before the instruction. Doing it here prevents any additional
1140-
// S_WAITCNTs from being emitted if the instruction was marked as
1141-
// requiring a WAITCNT beforehand.
1138+
// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1139+
// not, we need to ensure the subtarget is capable of backing off barrier
1140+
// instructions in case there are any outstanding memory operations that may
1141+
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
11421142
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1143-
!ST->hasAutoWaitcntBeforeBarrier()) {
1143+
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
11441144
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
11451145
}
11461146

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
4+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
5+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
7+
8+
; Subtargets must wait for outstanding memory instructions before a barrier if
9+
; they cannot back off of the barrier.
10+
11+
; Load, barrier, then store of the loaded value, with no fences around the
; barrier. Targets that can back off expect the wait for the load after
; s_barrier; gfx900, and gfx90a with the feature disabled, expect it before.
define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
12+
; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
13+
; GFX9-NO-BACKOFF: ; %bb.0:
14+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15+
; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
16+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17+
; GFX9-NO-BACKOFF-NEXT: s_barrier
18+
; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
19+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20+
; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
21+
;
22+
; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
23+
; GFX9-BACKOFF: ; %bb.0:
24+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25+
; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
26+
; GFX9-BACKOFF-NEXT: s_barrier
27+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
28+
; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
29+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
30+
; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
31+
;
32+
; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
33+
; GFX10-BACKOFF: ; %bb.0:
34+
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
36+
; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
37+
; GFX10-BACKOFF-NEXT: s_barrier
38+
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
39+
; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
40+
; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
41+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
42+
; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
43+
%load = load i32, i32* %in
44+
call void @llvm.amdgcn.s.barrier()
45+
store i32 %load, i32* %out
46+
ret void
47+
}
48+
49+
; Load, workgroup-scope release fence, barrier, workgroup-scope acquire fence,
; then store of the loaded value. The checks for all run lines expect an
; s_waitcnt before s_barrier in this case.
define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
50+
; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
51+
; GFX9-NO-BACKOFF: ; %bb.0:
52+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53+
; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
54+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
55+
; GFX9-NO-BACKOFF-NEXT: s_barrier
56+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
57+
; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
58+
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
59+
; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
60+
;
61+
; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
62+
; GFX9-BACKOFF: ; %bb.0:
63+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64+
; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
65+
; GFX9-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
66+
; GFX9-BACKOFF-NEXT: s_barrier
67+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
68+
; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
69+
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
70+
; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
71+
;
72+
; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
73+
; GFX10-BACKOFF: ; %bb.0:
74+
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
76+
; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
77+
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
78+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
79+
; GFX10-BACKOFF-NEXT: s_barrier
80+
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
81+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
82+
; GFX10-BACKOFF-NEXT: buffer_gl0_inv
83+
; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
84+
; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
85+
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
86+
; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
87+
%load = load i32, i32* %in
88+
fence syncscope("workgroup") release
89+
call void @llvm.amdgcn.s.barrier()
90+
fence syncscope("workgroup") acquire
91+
store i32 %load, i32* %out
92+
ret void
93+
}
94+
95+
declare void @llvm.amdgcn.s.barrier()
96+
97+
attributes #0 = { nounwind }

llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ body: |
3535
; GFX10: S_WAITCNT 0
3636
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
3737
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
38-
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
38+
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
3939
; GFX10: S_BARRIER
4040
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
4141
; GFX10: S_WAITCNT 112
@@ -112,7 +112,7 @@ body: |
112112
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
113113
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
114114
; GFX10: S_WAITCNT 0
115-
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
115+
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
116116
; GFX10: S_BARRIER
117117
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
118118
; GFX10: S_WAITCNT 112

llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
22
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
3-
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
44

55
; GCN-LABEL: barrier_vmcnt_global:
66
; GFX8: flat_load_dword
@@ -42,7 +42,7 @@ bb:
4242
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
4343
store i32 0, i32 addrspace(1)* %tmp5, align 4
4444
fence syncscope("singlethread") release
45-
tail call void @llvm.amdgcn.s.barrier() #3
45+
tail call void @llvm.amdgcn.s.barrier()
4646
fence syncscope("singlethread") acquire
4747
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
4848
%tmp7 = lshr exact i64 %tmp6, 32
@@ -116,7 +116,7 @@ bb:
116116
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
117117
store i32 0, i32* %tmp5, align 4
118118
fence syncscope("singlethread") release
119-
tail call void @llvm.amdgcn.s.barrier() #3
119+
tail call void @llvm.amdgcn.s.barrier()
120120
fence syncscope("singlethread") acquire
121121
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
122122
%tmp7 = lshr exact i64 %tmp6, 32

0 commit comments

Comments
 (0)