From 2c82a126d762c14c2f3df2d03a6ae5fb37c3351a Mon Sep 17 00:00:00 2001
From: Austin Kerbow
Date: Thu, 24 Feb 2022 23:26:51 -0800
Subject: [PATCH] [AMDGPU] Omit unnecessary waitcnt before barriers

It is not necessary to wait for all outstanding memory operations before
barriers on hardware that can back off of the barrier in the event of an
exception when traps are enabled. Add a new subtarget feature which tracks
which HW has this ability.

Reviewed By: #amdgpu, rampitec

Differential Revision: https://reviews.llvm.org/D130722
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 30 ++++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  7 ++
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   | 10 +-
 .../back-off-barrier-subtarget-feature.ll     | 97 +++++++++++++++++++
 .../AMDGPU/waitcnt-preexisting-vscnt.mir      | 42 ++++----
 llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll     |  6 +-
 6 files changed, 153 insertions(+), 39 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 72db379360c25..3d39584ad41af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -791,6 +791,12 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
   "Hardware automatically inserts waitcnt before barrier"
 >;
 
+def FeatureBackOffBarrier : SubtargetFeature <"back-off-barrier",
+  "BackOffBarrier",
+  "true",
+  "Hardware supports backing off s_barrier if an exception occurs"
+>;
+
 def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
   "HasTrigReducedRange",
   "true",
@@ -1101,7 +1107,8 @@ def FeatureISAVersion9_0_A : FeatureSet<
    FeatureMadMacF32Insts,
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 def FeatureISAVersion9_0_C : FeatureSet<
   [FeatureGFX9,
@@ -1138,7 +1145,8 @@ def FeatureISAVersion9_4_0 : FeatureSet<
    FeatureSupportsSRAMECC,
    FeaturePackedTID,
    FeatureArchitectedFlatScratch,
-   FullRate64Ops]>;
+   FullRate64Ops,
+   FeatureBackOffBarrier]>;
 
 // TODO: Organize more features into groups.
 def FeatureGroup {
@@ -1173,7 +1181,8 @@ def FeatureISAVersion10_1_0 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_1 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1195,7 +1204,8 @@ def FeatureISAVersion10_1_1 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_2 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1217,7 +1227,8 @@ def FeatureISAVersion10_1_2 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_1_3 : FeatureSet<
   !listconcat(FeatureGroup.GFX10_1_Bugs,
@@ -1235,7 +1246,8 @@ def FeatureISAVersion10_1_3 : FeatureSet<
      FeatureMadMacF32Insts,
      FeatureDsSrc2Insts,
      FeatureLdsMisalignedBug,
-     FeatureSupportsXNACK])>;
+     FeatureSupportsXNACK,
+     FeatureBackOffBarrier])>;
 
 def FeatureISAVersion10_3_0 : FeatureSet<
   [FeatureGFX10,
@@ -1252,7 +1264,8 @@ def FeatureISAVersion10_3_0 : FeatureSet<
    FeatureNSAEncoding,
    FeatureNSAMaxSize13,
    FeatureWavefrontSize32,
-   FeatureShaderCyclesRegister]>;
+   FeatureShaderCyclesRegister,
+   FeatureBackOffBarrier]>;
 
 def FeatureISAVersion11_Common : FeatureSet<
   [FeatureGFX11,
@@ -1270,7 +1283,8 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeatureAtomicFaddNoRtnInsts,
    FeatureImageInsts,
    FeaturePackedTID,
-   FeatureVcmpxPermlaneHazard]>;
+   FeatureVcmpxPermlaneHazard,
+   FeatureBackOffBarrier]>;
 
 def FeatureISAVersion11_0_0 : FeatureSet<
   !listconcat(FeatureISAVersion11_Common.Features,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 5a8be4d12de1e..b4a0dd7986515 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -71,6 +71,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // Dynamically set bits that enable features.
   bool FlatForGlobal = false;
   bool AutoWaitcntBeforeBarrier = false;
+  bool BackOffBarrier = false;
   bool UnalignedScratchAccess = false;
   bool UnalignedAccessMode = false;
   bool HasApertureRegs = false;
@@ -506,6 +507,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AutoWaitcntBeforeBarrier;
   }
 
+  /// \returns true if the target supports backing off of s_barrier instructions
+  /// when an exception is raised.
+  bool supportsBackOffBarrier() const {
+    return BackOffBarrier;
+  }
+
   bool hasUnalignedBufferAccess() const {
     return UnalignedBufferAccess;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 349bcbf82195c..240d6a5723d56 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1181,12 +1181,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       }
     }
 
-    // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
-    // occurs before the instruction. Doing it here prevents any additional
-    // S_WAITCNTs from being emitted if the instruction was marked as
-    // requiring a WAITCNT beforehand.
+    // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
+    // not, we need to ensure the subtarget is capable of backing off barrier
+    // instructions in case there are any outstanding memory operations that may
+    // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
     if (MI.getOpcode() == AMDGPU::S_BARRIER &&
-        !ST->hasAutoWaitcntBeforeBarrier()) {
+        !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
       Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
new file mode 100644
index 0000000000000..337dcfc652bd0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NO-BACKOFF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-BACKOFF %s
+
+; Subtargets must wait for outstanding memory instructions before a barrier if
+; they cannot back off of the barrier.
+
+define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  call void @llvm.amdgcn.s.barrier()
+  store i32 %load, i32* %out
+  ret void
+}
+
+define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
+; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-NO-BACKOFF:       ; %bb.0:
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_barrier
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX9-BACKOFF:       ; %bb.0:
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX9-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_barrier
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
+; GFX10-BACKOFF:       ; %bb.0:
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_barrier
+; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    buffer_gl0_inv
+; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32* %in
+  fence syncscope("workgroup") release
+  call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store i32 %load, i32* %out
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier()
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index 1365ff559f3e8..0c433dcb59bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -34,17 +34,15 @@ body: |
     liveins: $vgpr0_vgpr1, $vgpr2
 
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
-    ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GFX10-NEXT: S_BARRIER
-    ; GFX10-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX10-NEXT: S_WAITCNT 112
-    ; GFX10-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX10-NEXT: S_ENDPGM 0
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
+    ; GFX10: S_BARRIER
+    ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX10: S_WAITCNT 112
+    ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX10: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 1
     S_BARRIER
@@ -116,18 +114,16 @@ body: |
     liveins: $vgpr0_vgpr1, $vgpr2
 
    ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
-    ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 0
-    ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; GFX10-NEXT: S_BARRIER
-    ; GFX10-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX10-NEXT: S_WAITCNT 112
-    ; GFX10-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX10-NEXT: S_ENDPGM 0
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
+    ; GFX10: S_WAITCNT 0
+    ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 1
+    ; GFX10: S_BARRIER
+    ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX10: S_WAITCNT 112
+    ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX10: S_ENDPGM 0
     GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     S_WAITCNT 0
     S_WAITCNT_VSCNT undef $sgpr_null, 1
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index f1c5c5b0ee65e..e78b5355c2d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
 
 ; GCN-LABEL: barrier_vmcnt_global:
 ; GFX8: flat_load_dword
@@ -42,7 +42,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
   store i32 0, i32 addrspace(1)* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
@@ -116,7 +116,7 @@ bb:
   %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
   store i32 0, i32* %tmp5, align 4
   fence syncscope("singlethread") release
-  tail call void @llvm.amdgcn.s.barrier() #3
+  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
   %tmp6 = add nuw nsw i64 %tmp2, 4294967296
   %tmp7 = lshr exact i64 %tmp6, 32
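-- 

A note on trying this out: the kernel below is a minimal sketch modeled on the
new test above (the file name and function name are hypothetical, and the
snippet is not part of this patch). It makes the effect of the feature visible
on a subtarget that has FeatureBackOffBarrier, such as gfx90a: by default the
s_waitcnt for the outstanding load should land after s_barrier, while
-mattr=-back-off-barrier should restore the conservative
s_waitcnt vmcnt(0) lgkmcnt(0) before the barrier, matching the GFX9-BACKOFF
and GFX9-NO-BACKOFF check lines.

  ; barrier-demo.ll (hypothetical)
  ; Compare the output of:
  ;   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < barrier-demo.ll
  ;   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-back-off-barrier < barrier-demo.ll
  declare void @llvm.amdgcn.s.barrier()

  define void @load_barrier_store(i32* %in, i32* %out) {
    ; The flat load is still outstanding when the wave reaches the barrier.
    %v = load i32, i32* %in
    call void @llvm.amdgcn.s.barrier()
    ; The wait for the load is only required here, before its use.
    store i32 %v, i32* %out
    ret void
  }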