[AMDGPU] Fix AMDGPUUnifyDivergentExitNodes
Summary:
For the case where "done" bits on existing exports are removed
by unifyReturnBlockSet(), unify all return blocks - even the
uniformly reached ones. We do not want to end up with a non-unified,
uniformly reached block containing a normal export with the "done"
bit cleared.

That case is believed to be rare; it can occur with infinite loops
in pixel shaders.

This is a fix for D71192.
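
In essence (a condensed sketch of the change, taken from the diff below rather than a new implementation): when an export will be inserted, the uniformly reached return blocks are appended to the set handed to unifyReturnBlockSet(), so they are unified along with the divergently reached ones:

  // When inserting the export, unify uniformly reached return blocks too,
  // since the "done" bits on existing exports will be cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
                         UniformlyReachedRetBlocks.end());
  }
  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI, "UnifiedReturnBlock");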

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76364
piotrAMD committed Mar 18, 2020
1 parent f57290e commit d1a7bfc
Showing 3 changed files with 56 additions and 3 deletions.
17 changes: 16 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -208,6 +208,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
   SmallVector<BasicBlock *, 4> UnreachableBlocks;
 
   // Dummy return block for infinite loop.
@@ -219,6 +220,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         ReturningBlocks.push_back(BB);
+      else
+        UniformlyReachedRetBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         UnreachableBlocks.push_back(BB);
@@ -332,6 +335,18 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
       = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
+  // Unify returning blocks. If we are going to insert the export it is also
+  // necessary to include blocks that are uniformly reached, because in addition
+  // to inserting the export the "done" bits on existing exports will be cleared
+  // and we do not want to end up with the normal export in a non-unified,
+  // uniformly reached block with the "done" bit cleared.
+  auto BlocksToUnify = std::move(ReturningBlocks);
+  if (InsertExport) {
+    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
+                         UniformlyReachedRetBlocks.end());
+  }
+
+  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+                      "UnifiedReturnBlock");
   return true;
 }
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -719,6 +719,46 @@ bb5: ; preds = %bb3
   unreachable
 }
 
+; Test that there is an extra export inserted after the normal export,
+; if the normal export is inside a uniformly reached block and there is
+; an infinite loop in the pixel shader.
+
+; IR-LABEL: @uniformly_reached_export
+; IR-NEXT: .entry:
+; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
+
+; IR: [[FLOW]]:
+; IR-NEXT: phi
+; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]]
+
+; IR: [[FLOW2]]:
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: [[EXP]]:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br label %[[FLOW]]
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: ret void
+
+define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp26, label %.preheader.1, label %bb27
+
+.preheader.1: ; preds = %.entry
+  br label %bb
+
+bb: ; preds = %bb, %.preheader.1
+  br label %bb
+
+bb27: ; preds = %.entry
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -17,8 +17,6 @@ define amdgpu_ps void @_amdgpu_ps_main() local_unnamed_addr #3 {
 ; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
 ; IR: TransitionBlock:
 ; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
-; IR: n31:
-; IR-NEXT: ret void
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
 ; IR-NEXT: ret void
