[AMDGPU] Add llvm.amdgcn.wqm.helper intrinsic to complement demote

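Originally llvm.amdgcn.wqm.demote worked with llvm.amdgcn.ps.live; however, this required changing the declaration of ps.live (dropping its IntrNoMem attribute). While the declaration of ps.live can be updated in LLPC, other users such as Mesa should not be expected to change.

This change restores IntrNoMem on llvm.amdgcn.ps.live and adds a separate intrinsic, llvm.amdgcn.wqm.helper, which is declared without IntrNoMem and is intended to be used together with llvm.amdgcn.wqm.demote. Both intrinsics are lowered through the same live-mask query path in SIWholeQuadMode.

Change-Id: I7a18e7931bd7f2b4e0bfd4dc5718ff3718bb9d27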
jaebaek · Oct 10, 2019 · 4055769 · 4055769
1 parent 8e41591
commit 4055769
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 7 deletions.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1276,7 +1276,13 @@ def int_amdgcn_interp_p2_f16 :
 
 // Pixel shaders only: whether the current pixel is live (i.e. not a helper
 // invocation for derivative computation).
-def int_amdgcn_ps_live : Intrinsic <[llvm_i1_ty], [], []>;
+def int_amdgcn_ps_live : Intrinsic <
+  [llvm_i1_ty],
+  [],
+  [IntrNoMem]>;
+
+// Like ps.live, but not marked IntrNoMem so that it cannot be moved by LICM.
+def int_amdgcn_wqm_helper : Intrinsic <[llvm_i1_ty], [], []>;
 
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -98,6 +98,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_wqm_helper>;
 def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -325,14 +325,19 @@ def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
   let isBranch = 1;
 }
 
-let Uses = [EXEC] in {
-
 def SI_PS_LIVE : PseudoInstSI <
   (outs SReg_1:$dst), (ins),
   [(set i1:$dst, (int_amdgcn_ps_live))]> {
   let SALU = 1;
 }
 
+let Uses = [EXEC] in {
+def SI_WQM_HELPER : PseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_wqm_helper))]> {
+  let SALU = 1;
+}
+
 let Defs = [EXEC] in {
 def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> {
 }

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -408,7 +408,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         III.Disabled = StateWQM | StateWWM;
         continue;
       } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_WQM_HELPER) {
           LiveMaskQueries.push_back(&MI);
         } else if (Opcode == AMDGPU::SI_DEMOTE_I1) {
           DemoteInstrs.push_back(&MI);
@@ -1036,6 +1036,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
 
     switch (MI.getOpcode()) {
     case AMDGPU::SI_PS_LIVE:
+    case AMDGPU::SI_WQM_HELPER:
       lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM);
       break;
     case AMDGPU::SI_DEMOTE_I1: {

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -172,7 +172,7 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
   br label %.continue0
 
 .continue0:
-  %live = call i1 @llvm.amdgcn.ps.live()
+  %live = call i1 @llvm.amdgcn.wqm.helper()
   %live.cond = select i1 %live, i32 0, i32 1065353216
   %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
   %live.v0f = bitcast i32 %live.v0 to float
@@ -228,7 +228,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 
 .continue0:
   %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
-  %live = call i1 @llvm.amdgcn.ps.live()
+  %live = call i1 @llvm.amdgcn.wqm.helper()
   %live.cond = select i1 %live, i32 0, i32 %count
   %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
   %live.v0f = bitcast i32 %live.v0 to float
@@ -255,7 +255,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 }
 
 declare void @llvm.amdgcn.wqm.demote(i1) #0
-declare i1 @llvm.amdgcn.ps.live() #1
+declare i1 @llvm.amdgcn.wqm.helper() #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 declare float @llvm.amdgcn.wqm.f32(float) #1