Skip to content

Commit

Permalink
[AMDGPU] Omit unnecessary waitcnt before barriers
Browse files Browse the repository at this point in the history
On hardware that can back off of a barrier when an exception occurs while
traps are enabled, it is not necessary to wait for all outstanding memory
operations before the barrier. Add a new subtarget feature that tracks which
hardware has this ability.

Reviewed By: #amdgpu, rampitec

Differential Revision: https://reviews.llvm.org/D130722
  • Loading branch information
kerbowa committed Jul 29, 2022
1 parent 2063b5e commit 2c82a12
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 39 deletions.
30 changes: 22 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,12 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
"Hardware automatically inserts waitcnt before barrier"
>;

// Subtarget feature "back-off-barrier": sets the subtarget's BackOffBarrier
// field to true on hardware that can back off of an s_barrier when an
// exception occurs. SIInsertWaitcnts consults this (together with the
// auto-waitcnt-before-barrier feature) when deciding whether to force an
// all-zero wait in front of S_BARRIER.
def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
"BackOffBarrier",
"true",
"Hardware supports backing off s_barrier if an exception occurs"
>;

def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
"HasTrigReducedRange",
"true",
Expand Down Expand Up @@ -1101,7 +1107,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
FeatureMadMacF32Insts,
FeatureSupportsSRAMECC,
FeaturePackedTID,
FullRate64Ops]>;
FullRate64Ops,
FeatureBackOffBarrier]>;

def FeatureISAVersion9_0_C : FeatureSet<
[FeatureGFX9,
Expand Down Expand Up @@ -1138,7 +1145,8 @@ def FeatureISAVersion9_4_0 : FeatureSet<
FeatureSupportsSRAMECC,
FeaturePackedTID,
FeatureArchitectedFlatScratch,
FullRate64Ops]>;
FullRate64Ops,
FeatureBackOffBarrier]>;

// TODO: Organize more features into groups.
def FeatureGroup {
Expand Down Expand Up @@ -1173,7 +1181,8 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureSupportsXNACK])>;
FeatureSupportsXNACK,
FeatureBackOffBarrier])>;

def FeatureISAVersion10_1_1 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
Expand All @@ -1195,7 +1204,8 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureSupportsXNACK])>;
FeatureSupportsXNACK,
FeatureBackOffBarrier])>;

def FeatureISAVersion10_1_2 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
Expand All @@ -1217,7 +1227,8 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureSupportsXNACK])>;
FeatureSupportsXNACK,
FeatureBackOffBarrier])>;

def FeatureISAVersion10_1_3 : FeatureSet<
!listconcat(FeatureGroup.GFX10_1_Bugs,
Expand All @@ -1235,7 +1246,8 @@ def FeatureISAVersion10_1_3 : FeatureSet<
FeatureMadMacF32Insts,
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureSupportsXNACK])>;
FeatureSupportsXNACK,
FeatureBackOffBarrier])>;

def FeatureISAVersion10_3_0 : FeatureSet<
[FeatureGFX10,
Expand All @@ -1252,7 +1264,8 @@ def FeatureISAVersion10_3_0 : FeatureSet<
FeatureNSAEncoding,
FeatureNSAMaxSize13,
FeatureWavefrontSize32,
FeatureShaderCyclesRegister]>;
FeatureShaderCyclesRegister,
FeatureBackOffBarrier]>;

def FeatureISAVersion11_Common : FeatureSet<
[FeatureGFX11,
Expand All @@ -1270,7 +1283,8 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureAtomicFaddNoRtnInsts,
FeatureImageInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard]>;
FeatureVcmpxPermlaneHazard,
FeatureBackOffBarrier]>;

def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// Dynamically set bits that enable features.
bool FlatForGlobal = false;
bool AutoWaitcntBeforeBarrier = false;
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
bool HasApertureRegs = false;
Expand Down Expand Up @@ -506,6 +507,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return AutoWaitcntBeforeBarrier;
}

/// Reports whether this subtarget can back off of an s_barrier instruction
/// when an exception is raised.
/// \returns the current value of the BackOffBarrier feature bit.
bool supportsBackOffBarrier() const { return BackOffBarrier; }

bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;
}
Expand Down
10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1181,12 +1181,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
}

// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
// occurs before the instruction. Doing it here prevents any additional
// S_WAITCNTs from being emitted if the instruction was marked as
// requiring a WAITCNT beforehand.
// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
// not, we need to ensure the subtarget is capable of backing off barrier
// instructions in case there are any outstanding memory operations that may
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier()) {
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}

Expand Down
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s

; Subtargets must wait for outstanding memory instructions before a barrier if
; they cannot back off of the barrier.

; Load, barrier, store with no fences around the barrier. The checks for runs
; without back-off support expect an s_waitcnt for the load immediately before
; s_barrier; the checks for runs with back-off support expect that wait only
; after s_barrier, ahead of the store that uses the loaded value.
define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
; GFX9-NO-BACKOFF: ; %bb.0:
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: s_barrier
; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
; GFX9-BACKOFF: ; %bb.0:
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX9-BACKOFF-NEXT: s_barrier
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
; GFX10-BACKOFF: ; %bb.0:
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX10-BACKOFF-NEXT: s_barrier
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32* %in
call void @llvm.amdgcn.s.barrier()
store i32 %load, i32* %out
ret void
}

; Same load/barrier/store sequence, but with a workgroup-scope release fence
; before the barrier and a workgroup-scope acquire fence after it. Every set of
; checks here still expects some s_waitcnt between the load and s_barrier
; (presumably required by the release fence rather than by the barrier itself);
; on the gfx9 runs, back-off support changes which counters that wait covers.
define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
; GFX9-NO-BACKOFF: ; %bb.0:
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: s_barrier
; GFX9-NO-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX9-NO-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NO-BACKOFF-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
; GFX9-BACKOFF: ; %bb.0:
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX9-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-BACKOFF-NEXT: s_barrier
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX9-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-BACKOFF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
; GFX10-BACKOFF: ; %bb.0:
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: flat_load_dword v0, v[0:1]
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: s_barrier
; GFX10-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: buffer_gl0_inv
; GFX10-BACKOFF-NEXT: flat_store_dword v[2:3], v0
; GFX10-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-BACKOFF-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32* %in
fence syncscope("workgroup") release
call void @llvm.amdgcn.s.barrier()
fence syncscope("workgroup") acquire
store i32 %load, i32* %out
ret void
}

declare void @llvm.amdgcn.s.barrier()

attributes #0 = { nounwind }
42 changes: 19 additions & 23 deletions llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,15 @@ body: |
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: S_WAITCNT 0
; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10-NEXT: S_BARRIER
; GFX10-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10-NEXT: S_WAITCNT 112
; GFX10-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10-NEXT: S_ENDPGM 0
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT_VSCNT undef $sgpr_null, 1
S_BARRIER
Expand Down Expand Up @@ -116,18 +114,16 @@ body: |
liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: S_WAITCNT 0
; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10-NEXT: S_WAITCNT 0
; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10-NEXT: S_BARRIER
; GFX10-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10-NEXT: S_WAITCNT 112
; GFX10-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10-NEXT: S_ENDPGM 0
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
; GFX10: S_WAITCNT 0
; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX10: S_BARRIER
; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_WAITCNT 112
; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
; GFX10: S_ENDPGM 0
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
S_WAITCNT 0
S_WAITCNT_VSCNT undef $sgpr_null, 1
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s

; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
Expand Down Expand Up @@ -42,7 +42,7 @@ bb:
%tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
store i32 0, i32 addrspace(1)* %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier() #3
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
Expand Down Expand Up @@ -116,7 +116,7 @@ bb:
%tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
store i32 0, i32* %tmp5, align 4
fence syncscope("singlethread") release
tail call void @llvm.amdgcn.s.barrier() #3
tail call void @llvm.amdgcn.s.barrier()
fence syncscope("singlethread") acquire
%tmp6 = add nuw nsw i64 %tmp2, 4294967296
%tmp7 = lshr exact i64 %tmp6, 32
Expand Down

0 comments on commit 2c82a12

Please sign in to comment.