Skip to content

[Uniformity] Fixed control-div early stop #139667

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 36 additions & 31 deletions llvm/include/llvm/ADT/GenericUniformityImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -610,13 +610,29 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
<< Context.print(&DivTermBlock) << "\n");

// Early stopping criterion
int FloorIdx = CyclePOT.size() - 1;
const BlockT *FloorLabel = nullptr;
int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);

// Bootstrap with branch targets
auto const *DivTermCycle = CI.getCycle(&DivTermBlock);

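// Locate the outermost irreducible cycle that can be reached from
// DivTermCycle by walking up parent cycles without crossing a reducible
// cycle. The result is nullptr if DivTermBlock is not in any cycle or if
// its innermost cycle is reducible. This is done with a lambda that is
// defined and invoked in the same statement.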
const CycleT *IrreducibleAncestor = [](const CycleT *C) -> const CycleT * {
if (!C)
return nullptr;
if (C->isReducible())
return nullptr;
while (const CycleT *P = C->getParentCycle()) {
if (P->isReducible())
return C;
C = P;
}
assert(!C->getParentCycle());
assert(!C->isReducible());
return C;
}(DivTermCycle);

for (const auto *SuccBlock : successors(&DivTermBlock)) {
if (DivTermCycle && !DivTermCycle->contains(SuccBlock)) {
// If DivTerm exits the cycle immediately, computeJoin() might
Expand All @@ -626,14 +642,24 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
<< Context.print(SuccBlock) << "\n");
}
auto SuccIdx = CyclePOT.getIndex(SuccBlock);
visitEdge(*SuccBlock, *SuccBlock);
FloorIdx = std::min<int>(FloorIdx, SuccIdx);
}

// Technically propagation can continue until it reaches the last node.
//
// For efficiency, propagation can stop if FreshLabels.count() == 1. But
// for irreducible cycles, let propagation continue until it leaves the
// irreducible cycles (see code for details).
while (true) {
auto BlockIdx = FreshLabels.find_last();
if (BlockIdx == -1 || BlockIdx < FloorIdx)
if (BlockIdx == -1)
break;

const auto *Block = CyclePOT[BlockIdx];
// If there is no irreducible cycle, stop if FreshLabels.count() == 1 and
// Block is the IPD. If Block is in any irreducible cycle, continue
// propagation.
if (FreshLabels.count() == 1 &&
(!IrreducibleAncestor || !IrreducibleAncestor->contains(Block)))
break;

LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
Expand All @@ -644,16 +670,12 @@ template <typename ContextT> class DivergencePropagator {
continue;
}

const auto *Block = CyclePOT[BlockIdx];
LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
<< BlockIdx << "\n");

const auto *Label = BlockLabels[Block];
assert(Label);

bool CausedJoin = false;
int LoweredFloorIdx = FloorIdx;

// If the current block is the header of a reducible cycle that
// contains the divergent branch, then the label should be
// propagated to the cycle exits. Such a header is the "last
Expand Down Expand Up @@ -681,28 +703,11 @@ template <typename ContextT> class DivergencePropagator {
if (const auto *BlockCycle = getReducibleParent(Block)) {
SmallVector<BlockT *, 4> BlockCycleExits;
BlockCycle->getExitBlocks(BlockCycleExits);
for (auto *BlockCycleExit : BlockCycleExits) {
CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
LoweredFloorIdx =
std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
}
for (auto *BlockCycleExit : BlockCycleExits)
visitCycleExitEdge(*BlockCycleExit, *Label);
} else {
for (const auto *SuccBlock : successors(Block)) {
CausedJoin |= visitEdge(*SuccBlock, *Label);
LoweredFloorIdx =
std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
}
}

// Floor update
if (CausedJoin) {
// 1. Different labels pushed to successors
FloorIdx = LoweredFloorIdx;
} else if (FloorLabel != Label) {
// 2. No join caused BUT we pushed a label that is different than the
// last pushed label
FloorIdx = LoweredFloorIdx;
FloorLabel = Label;
for (const auto *SuccBlock : successors(Block))
visitEdge(*SuccBlock, *Label);
}
}

Expand Down
94 changes: 94 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/branch-after-join.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
;
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
;
;
; Entry (div.cond)
; / \
; B0 B3
; | |
; B1 B4
; | |
; \ /
; B5 (phi: divergent)
; |
; B6 (uni.cond)
; / \
; B7 B9
; | |
; B8 B10
; | |
; \ /
; B11 (phi: uniform)


; CHECK-LABEL: 'test_ctrl_divergence':
; CHECK-LABEL: BLOCK Entry
; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
; CHECK: DIVERGENT: br i1 %div.cond, label %B3, label %B0
;
; CHECK-LABEL: BLOCK B5
; CHECK: DIVERGENT: %div_a = phi i32 [ %a0, %B1 ], [ %a1, %B4 ]
; CHECK: DIVERGENT: %div_b = phi i32 [ %b0, %B1 ], [ %b1, %B4 ]
;
; CHECK-LABEL: BLOCK B6
; CHECK-NOT: DIVERGENT: %uni.cond = icmp
; CHECK-NOT: DIVERGENT: br i1 %uni.cond
;
; CHECK-LABEL: BLOCK B11
; CHECK-NOT: DIVERGENT: %div_d = phi i32


; Kernel shaped as two consecutive if-then-else diamonds:
;   Entry -> {B0 -> B1 | B3 -> B4} -> B5 -> B6 -> {B7 -> B8 | B9 -> B10} -> B11
; The first diamond is controlled by %div.cond, which is computed from the
; workitem id; the second is controlled by %uni.cond, which is computed only
; from the kernel argument %c.
define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
Entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%div.cond = icmp eq i32 %tid, 0
br i1 %div.cond, label %B3, label %B0 ; divergent branch

; False side of the first diamond (B0 -> B1).
B0:
%a0 = add i32 %a, 1
br label %B1

B1:
%b0 = add i32 %b, 2
br label %B5

; True side of the first diamond (B3 -> B4).
B3:
%a1 = add i32 %a, 10
br label %B4

B4:
%b1 = add i32 %b, 20
br label %B5

; Join of the first diamond: each phi merges one value from either side of
; the branch on %div.cond.
B5:
%div_a = phi i32 [%a0, %B1], [%a1, %B4]
%div_b = phi i32 [%b0, %B1], [%b1, %B4]
br label %B6

; Start of the second diamond; the condition does not depend on %tid.
B6:
%uni.cond = icmp eq i32 %c, 0
br i1 %uni.cond, label %B7, label %B9

B7:
%d1 = add i32 %d, 1
br label %B8

B8:
br label %B11

B9:
%d2 = add i32 %d, 3
br label %B10

B10:
br label %B11

; Join of the second diamond only. Despite the "div_" prefix in its name, the
; FileCheck expectations for block B11 require that this phi is not reported
; as divergent.
B11:
%div_d = phi i32 [%d1, %B8], [%d2, %B10]
ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = {nounwind readnone }
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ exit:
;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
;; CHECK: CYCLES ASSSUMED DIVERGENT:
;; CHECK: depth=2: entries(T P) S Q R
;; CHECK: CYCLES WITH DIVERGENT EXIT:
;; CHECK: depth=1: entries(B A) D T S Q P R C
;; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:

define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
entry:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s

; Control-flow edges of this kernel, as given by the terminators below:
;   entry -> E2, E1      (branch on %uni.cond)
;   E1    -> E2
;   E2    -> A, B        (branch on %uni.cond1)
;   A     -> E1, B       (branch on %div.cond)
;   B     -> C
;   C     -> E1, X       (branch on %uni.cond2)
; The blocks E1, E2, A, B and C form a cycle that entry can enter through
; either E1 or E2. The only branch on a value derived from the workitem id is
; the one in A, which sits inside that cycle.
define amdgpu_kernel void @cycle_inner_ipd(i32 %n, i32 %a, i32 %b) #0 {
;
; entry
; / \
; E2<------E1
; | \ ^^
; | \ / |
; | v/ |
; | A |
; | / |
; | / |
; vv |
; B------->C
; |
; X
;
;
; CHECK-LABEL: BLOCK entry
; CHECK: DIVERGENT: %tid = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK: DIVERGENT: %div.cond = icmp slt i32 %tid, 0
; CHECK: END BLOCK
;
; CHECK-LABEL: BLOCK B
; CHECK: DIVERGENT: %div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
; CHECK: END BLOCK

entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%div.cond = icmp slt i32 %tid, 0
; The three conditions below depend only on the kernel argument %a.
%uni.cond = icmp slt i32 %a, 0
%uni.cond1 = icmp slt i32 %a, 2
%uni.cond2 = icmp slt i32 %a, 10
br i1 %uni.cond, label %E2, label %E1

E1:
br label %E2

E2:
br i1 %uni.cond1, label %A, label %B


; The only branch on %div.cond. One successor (B) is reached directly, the
; other (E1) can reach B again through E2.
A:
br i1 %div.cond, label %E1, label %B

; B is reachable from A directly and from A via E1 and E2, so the two paths
; leaving the branch in A can meet here; the expectations above require this
; phi to be reported as divergent.
B:
%div.merge = phi i32 [ 0, %A ], [ %b, %E2 ]
br label %C

; Either goes back to E1 or leaves the cycle to X.
C:
br i1 %uni.cond2, label %E1, label %X

X:
ret void
}
75 changes: 75 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
;
; This is to test an if-then-else case with some unmerged basic blocks
; (https://github.com/llvm/llvm-project/issues/137277)
;
; Entry (div.cond)
; / \
; B0 B3
; | |
; B1 B4
; | |
; B2 B5
; \ /
; B6 (phi: divergent)
;


; CHECK-LABEL: 'test_ctrl_divergence':
; CHECK-LABEL: BLOCK Entry
; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
; CHECK: DIVERGENT: br i1 %div.cond, label %B3, label %B0
;
; CHECK-LABEL: BLOCK B6
; CHECK: DIVERGENT: %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
; CHECK: DIVERGENT: %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
; CHECK: DIVERGENT: %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]


; Kernel with an if-then-else whose sides are each a chain of three blocks
; (B0 -> B1 -> B2 and B3 -> B4 -> B5) that rejoin at B6, followed by a second
; branch in B6 on the same %div.cond that either skips or executes B7 before
; reaching B8. %div.cond is computed from the workitem id.
define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
Entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%div.cond = icmp eq i32 %tid, 0
br i1 %div.cond, label %B3, label %B0 ; divergent branch

; False side of the first branch: B0 -> B1 -> B2, one add per block.
B0:
%a0 = add i32 %a, 1
br label %B1

B1:
%b0 = add i32 %b, 2
br label %B2

B2:
%c0 = add i32 %c, 3
br label %B6

; True side of the first branch: B3 -> B4 -> B5, one add per block.
B3:
%a1 = add i32 %a, 10
br label %B4

B4:
%b1 = add i32 %b, 20
br label %B5

B5:
%c1 = add i32 %c, 30
br label %B6

; Join of the two chains: each phi merges the value defined at the matching
; depth of either chain. The block then branches on %div.cond a second time.
B6:
%div_a = phi i32 [%a0, %B2], [%a1, %B5]
%div_b = phi i32 [%b0, %B2], [%b1, %B5]
%div_c = phi i32 [%c0, %B2], [%c1, %B5]
br i1 %div.cond, label %B8, label %B7 ; divergent branch

B7:
%d1 = add i32 %d, 1
br label %B8

; Join of the second branch: reached from B6 directly or through B7.
B8:
%div_d = phi i32 [%d1, %B7], [%d, %B6]
ret void
}


declare i32 @llvm.amdgcn.workitem.id.x()
Loading
Loading