[AMDGPU] Fix AMDGPUUnifyDivergentExitNodes
Summary:
For the case where "done" bits on existing exports are removed
by unifyReturnBlockSet(), unify all return blocks - even the
uniformly reached ones. We do not want to end up with a non-unified,
uniformly reached block containing a normal export with the "done"
bit cleared.

That case is believed to be rare; it can occur with infinite loops
in pixel shaders.

This is a fix for D71192.
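
In essence (a condensed sketch of the change, taken from the diff below rather than a new implementation): when an export will be inserted, the uniformly reached return blocks are appended to the set handed to unifyReturnBlockSet(), so they are unified along with the divergently reached ones:

  // When inserting the export, unify uniformly reached return blocks too,
  // since the "done" bits on existing exports will be cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
                         UniformlyReachedRetBlocks.end());
  }
  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI, "UnifiedReturnBlock");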

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76364
piotrAMD committed Mar 18, 2020
1 parent f57290e commit d1a7bfc
Showing 3 changed files with 56 additions and 3 deletions.
17 changes: 16 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -208,6 +208,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Loop over all of the blocks in a function, tracking all of the blocks that
   // return.
   SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
   SmallVector<BasicBlock *, 4> UnreachableBlocks;
 
   // Dummy return block for infinite loop.
@@ -219,6 +220,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         ReturningBlocks.push_back(BB);
+      else
+        UniformlyReachedRetBlocks.push_back(BB);
     } else if (isa<UnreachableInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
         UnreachableBlocks.push_back(BB);
@@ -332,6 +335,18 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI
       = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
+  // Unify returning blocks. If we are going to insert the export it is also
+  // necessary to include blocks that are uniformly reached, because in addition
+  // to inserting the export the "done" bits on existing exports will be cleared
+  // and we do not want to end up with the normal export in a non-unified,
+  // uniformly reached block with the "done" bit cleared.
+  auto BlocksToUnify = std::move(ReturningBlocks);
+  if (InsertExport) {
+    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
+                         UniformlyReachedRetBlocks.end());
+  }
+
+  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+                      "UnifiedReturnBlock");
   return true;
 }
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -719,6 +719,46 @@ bb5: ; preds = %bb3
   unreachable
 }
 
+; Test that there is an extra export inserted after the normal export,
+; if the normal export is inside a uniformly reached block and there is
+; an infinite loop in the pixel shader.
+
+; IR-LABEL: @uniformly_reached_export
+; IR-NEXT: .entry:
+; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
+
+; IR: [[FLOW]]:
+; IR-NEXT: phi
+; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]]
+
+; IR: [[FLOW2]]:
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: [[EXP]]:
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg false, i1 immarg true)
+; IR-NEXT: br label %[[FLOW]]
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR-NEXT: ret void
+
+define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
+.entry:
+  %tmp26 = fcmp olt float %tmp25, 0.000000e+00
+  br i1 %tmp26, label %.preheader.1, label %bb27
+
+.preheader.1: ; preds = %.entry
+  br label %bb
+
+bb: ; preds = %bb, %.preheader.1
+  br label %bb
+
+bb27: ; preds = %.entry
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -17,8 +17,6 @@ define amdgpu_ps void @_amdgpu_ps_main() local_unnamed_addr #3 {
 ; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
 ; IR: TransitionBlock:
 ; IR-NEXT: br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
-; IR: n31:
-; IR-NEXT: ret void
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
 ; IR-NEXT: ret void
