-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[Uniformity] Fixed control-div early stop #139667
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis Author: Junjie Gu (jgu222) ChangesControl-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This change fixes this issue by stopping at the post-dominator of the successors of the divergent branch. Full diff: https://github.com/llvm/llvm-project/pull/139667.diff 4 Files Affected:
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d10355fff1bea..f479abdbb41b6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -610,9 +610,6 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
<< Context.print(&DivTermBlock) << "\n");
- // Early stopping criterion
- int FloorIdx = CyclePOT.size() - 1;
- const BlockT *FloorLabel = nullptr;
int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
// Bootstrap with branch targets
@@ -626,14 +623,36 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
<< Context.print(SuccBlock) << "\n");
}
- auto SuccIdx = CyclePOT.getIndex(SuccBlock);
visitEdge(*SuccBlock, *SuccBlock);
- FloorIdx = std::min<int>(FloorIdx, SuccIdx);
}
+ // Return true if B is inside an irreducible cycle
+ auto IsInIrreducibleCycle = [this](const BlockT *B) {
+ for (const auto *Cycle = CI.getCycle(B); Cycle;
+ Cycle = Cycle->getParentCycle()) {
+ if (!Cycle->isReducible())
+ return true;
+ }
+ return false;
+ };
+
+ // Technically propagation can continue until it reaches the last node.
+ //
+ // For efficiency, propagation can just stop at the IPD (immediate
+ // post-dominator) of successors(DivTemBlock) for any reducible graph.
+ // If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
+ //
+ // For irreducible cycle, propagation continues until it reaches out of
+ // any irreducible cycles first, then stop when FreshLabels.count()=1.
while (true) {
auto BlockIdx = FreshLabels.find_last();
- if (BlockIdx == -1 || BlockIdx < FloorIdx)
+ if (BlockIdx == -1)
+ break;
+
+ const auto *Block = CyclePOT[BlockIdx];
+ // If no irreducible cycle, stop if freshLable.count() = 1 and Block
+ // is the IPD. If it is in any irreducible cycle, continue propagation.
+ if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
break;
LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +663,12 @@ template <typename ContextT> class DivergencePropagator {
continue;
}
- const auto *Block = CyclePOT[BlockIdx];
LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
<< BlockIdx << "\n");
const auto *Label = BlockLabels[Block];
assert(Label);
- bool CausedJoin = false;
- int LoweredFloorIdx = FloorIdx;
-
// If the current block is the header of a reducible cycle that
// contains the divergent branch, then the label should be
// propagated to the cycle exits. Such a header is the "last
@@ -681,28 +696,11 @@ template <typename ContextT> class DivergencePropagator {
if (const auto *BlockCycle = getReducibleParent(Block)) {
SmallVector<BlockT *, 4> BlockCycleExits;
BlockCycle->getExitBlocks(BlockCycleExits);
- for (auto *BlockCycleExit : BlockCycleExits) {
- CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
- LoweredFloorIdx =
- std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
- }
+ for (auto *BlockCycleExit : BlockCycleExits)
+ visitCycleExitEdge(*BlockCycleExit, *Label);
} else {
- for (const auto *SuccBlock : successors(Block)) {
- CausedJoin |= visitEdge(*SuccBlock, *Label);
- LoweredFloorIdx =
- std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
- }
- }
-
- // Floor update
- if (CausedJoin) {
- // 1. Different labels pushed to successors
- FloorIdx = LoweredFloorIdx;
- } else if (FloorLabel != Label) {
- // 2. No join caused BUT we pushed a label that is different than the
- // last pushed label
- FloorIdx = LoweredFloorIdx;
- FloorLabel = Label;
+ for (const auto *SuccBlock : successors(Block))
+ visitEdge(*SuccBlock, *Label);
}
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 46e676b52c0ba..5bb59602faca4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -126,10 +126,10 @@ exit:
;; only the inner cycle is reported as diverged.
;;
;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
-;; CHECK: depth=2: entries(T P) S Q R
-;; CHECK: CYCLES WITH DIVERGENT EXIT:
-;; CHECK: depth=1: entries(B A) D T S Q P R C
+;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
+;; NOCHECK: depth=2: entries(T P) S Q R
+;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
+;; NOCHECK: depth=1: entries(B A) D T S Q P R C
define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
new file mode 100644
index 0000000000000..df949a86635c4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -0,0 +1,78 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test an if-then-else case with some unmerged basic blocks
+; (https://github.com/llvm/llvm-project/issues/137277)
+;
+; Entry (div.cond)
+; / \
+; B0 B3
+; | |
+; B1 B4
+; | |
+; B2 B5
+; \ /
+; B6 (phi: divergent)
+;
+
+
+; CHECK-LABEL: 'test_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
+; CHECK: DIVERGENT: br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL: BLOCK B6
+; CHECK: DIVERGENT: %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
+; CHECK: DIVERGENT: %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
+; CHECK: DIVERGENT: %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp eq i32 %tid, 0
+ br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+ %a0 = add i32 %a, 1
+ br label %B1
+
+B1:
+ %b0 = add i32 %b, 2
+ br label %B2
+
+B2:
+ %c0 = add i32 %c, 3
+ br label %B6
+
+B3:
+ %a1 = add i32 %a, 10
+ br label %B4
+
+B4:
+ %b1 = add i32 %b, 20
+ br label %B5
+
+B5:
+ %c1 = add i32 %c, 30
+ br label %B6
+
+B6:
+ %div_a = phi i32 [%a0, %B2], [%a1, %B5]
+ %div_b = phi i32 [%b0, %B2], [%b1, %B5]
+ %div_c = phi i32 [%c0, %B2], [%c1, %B5]
+ br i1 %div.cond, label %B8, label %B7 ; divergent branch
+
+B7:
+ %d1 = add i32 %d, 1
+ br label %B8
+
+B8:
+ %div_d = phi i32 [%d1, %B7], [%d, %B6]
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
new file mode 100644
index 0000000000000..54c641862fe79
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -0,0 +1,82 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test a divergent phi involving loops
+; (https://github.com/llvm/llvm-project/issues/137277).
+;
+; B0 (div.cond)
+; / \
+; (L)B1 B4
+; | |
+; B2 B5 (L)
+; | |
+; B3 /
+; \ /
+; B6 (phi: divergent)
+;
+
+;
+; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT: %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-LABEL: BLOCK B0
+; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
+; CHECK-LABEL: BLOCK B3
+; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
+; CHECK-LABEL: BLOCK B5
+; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
+; CHECK-LABEL BLOCK B6
+; CHECK: DIVERGENT: %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
+;
+
+define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %uni.cond0 = icmp eq i32 %d, 0
+ br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
+
+B0:
+ %div.cond = icmp eq i32 %tid, 0
+ br i1 %div.cond, label %B4, label %B1 ; divergent branch
+
+B1:
+ %uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
+ %a0 = add i32 %uni.a0, 1
+ %uni.cond1 = icmp slt i32 %a0, %b
+ br i1 %uni.cond1, label %B1, label %B2
+
+B2:
+ %a1 = add i32 %a0, 10
+ br label %B3
+
+B3:
+ %uni_a = phi i32 [%a1, %B2], [%a, %Entry]
+ br label %B6
+
+B4:
+ %a2 = add i32 %a, 20
+ br label %B5
+
+B5:
+ %uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
+ %uni_a3 = add i32 %uni.a3, 1
+ %uni.cond2 = icmp slt i32 %uni_a3, %c
+ br i1 %uni.cond2, label %B5, label %B6
+
+B6:
+ %div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ; divergent
+ %div.cond2 = icmp eq i32 %tid, 2
+ br i1 %div.cond2, label %B7, label %B8 ; divergent branch
+
+B7:
+ %c0 = add i32 %div_a, 2 ; divergent
+ br label %B8
+
+B8:
+ %ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
|
@llvm/pr-subscribers-llvm-adt Author: Junjie Gu (jgu222) ChangesControl-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This change fixes this issue by stopping at the post-dominator of the successors of the divergent branch. Full diff: https://github.com/llvm/llvm-project/pull/139667.diff 4 Files Affected:
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index d10355fff1bea..f479abdbb41b6 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -610,9 +610,6 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: "
<< Context.print(&DivTermBlock) << "\n");
- // Early stopping criterion
- int FloorIdx = CyclePOT.size() - 1;
- const BlockT *FloorLabel = nullptr;
int DivTermIdx = CyclePOT.getIndex(&DivTermBlock);
// Bootstrap with branch targets
@@ -626,14 +623,36 @@ template <typename ContextT> class DivergencePropagator {
LLVM_DEBUG(dbgs() << "\tImmediate divergent cycle exit: "
<< Context.print(SuccBlock) << "\n");
}
- auto SuccIdx = CyclePOT.getIndex(SuccBlock);
visitEdge(*SuccBlock, *SuccBlock);
- FloorIdx = std::min<int>(FloorIdx, SuccIdx);
}
+ // Return true if B is inside an irreducible cycle
+ auto IsInIrreducibleCycle = [this](const BlockT *B) {
+ for (const auto *Cycle = CI.getCycle(B); Cycle;
+ Cycle = Cycle->getParentCycle()) {
+ if (!Cycle->isReducible())
+ return true;
+ }
+ return false;
+ };
+
+ // Technically propagation can continue until it reaches the last node.
+ //
+ // For efficiency, propagation can just stop at the IPD (immediate
+ // post-dominator) of successors(DivTemBlock) for any reducible graph.
+ // If FreshLabels.count()=1, the block in FreshLabels should be the IPD.
+ //
+ // For irreducible cycle, propagation continues until it reaches out of
+ // any irreducible cycles first, then stop when FreshLabels.count()=1.
while (true) {
auto BlockIdx = FreshLabels.find_last();
- if (BlockIdx == -1 || BlockIdx < FloorIdx)
+ if (BlockIdx == -1)
+ break;
+
+ const auto *Block = CyclePOT[BlockIdx];
+ // If no irreducible cycle, stop if freshLable.count() = 1 and Block
+ // is the IPD. If it is in any irreducible cycle, continue propagation.
+ if (FreshLabels.count() == 1 && !IsInIrreducibleCycle(Block))
break;
LLVM_DEBUG(dbgs() << "Current labels:\n"; printDefs(dbgs()));
@@ -644,16 +663,12 @@ template <typename ContextT> class DivergencePropagator {
continue;
}
- const auto *Block = CyclePOT[BlockIdx];
LLVM_DEBUG(dbgs() << "visiting " << Context.print(Block) << " at index "
<< BlockIdx << "\n");
const auto *Label = BlockLabels[Block];
assert(Label);
- bool CausedJoin = false;
- int LoweredFloorIdx = FloorIdx;
-
// If the current block is the header of a reducible cycle that
// contains the divergent branch, then the label should be
// propagated to the cycle exits. Such a header is the "last
@@ -681,28 +696,11 @@ template <typename ContextT> class DivergencePropagator {
if (const auto *BlockCycle = getReducibleParent(Block)) {
SmallVector<BlockT *, 4> BlockCycleExits;
BlockCycle->getExitBlocks(BlockCycleExits);
- for (auto *BlockCycleExit : BlockCycleExits) {
- CausedJoin |= visitCycleExitEdge(*BlockCycleExit, *Label);
- LoweredFloorIdx =
- std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(BlockCycleExit));
- }
+ for (auto *BlockCycleExit : BlockCycleExits)
+ visitCycleExitEdge(*BlockCycleExit, *Label);
} else {
- for (const auto *SuccBlock : successors(Block)) {
- CausedJoin |= visitEdge(*SuccBlock, *Label);
- LoweredFloorIdx =
- std::min<int>(LoweredFloorIdx, CyclePOT.getIndex(SuccBlock));
- }
- }
-
- // Floor update
- if (CausedJoin) {
- // 1. Different labels pushed to successors
- FloorIdx = LoweredFloorIdx;
- } else if (FloorLabel != Label) {
- // 2. No join caused BUT we pushed a label that is different than the
- // last pushed label
- FloorIdx = LoweredFloorIdx;
- FloorLabel = Label;
+ for (const auto *SuccBlock : successors(Block))
+ visitEdge(*SuccBlock, *Label);
}
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
index 46e676b52c0ba..5bb59602faca4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
@@ -126,10 +126,10 @@ exit:
;; only the inner cycle is reported as diverged.
;;
;; CHECK-LABEL: UniformityInfo for function 'headers_b_t':
-;; CHECK: CYCLES ASSSUMED DIVERGENT:
-;; CHECK: depth=2: entries(T P) S Q R
-;; CHECK: CYCLES WITH DIVERGENT EXIT:
-;; CHECK: depth=1: entries(B A) D T S Q P R C
+;; NOCHECK: CYCLES ASSSUMED DIVERGENT:
+;; NOCHECK: depth=2: entries(T P) S Q R
+;; NOCHECK: CYCLES WITH DIVERGENT EXIT:
+;; NOCHECK: depth=1: entries(B A) D T S Q P R C
define amdgpu_kernel void @headers_b_t(i32 %a, i32 %b, i32 %c) {
entry:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
new file mode 100644
index 0000000000000..df949a86635c4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_branch.ll
@@ -0,0 +1,78 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test an if-then-else case with some unmerged basic blocks
+; (https://github.com/llvm/llvm-project/issues/137277)
+;
+; Entry (div.cond)
+; / \
+; B0 B3
+; | |
+; B1 B4
+; | |
+; B2 B5
+; \ /
+; B6 (phi: divergent)
+;
+
+
+; CHECK-LABEL: 'test_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
+; CHECK: DIVERGENT: br i1 %div.cond, label %B3, label %B0
+;
+; CHECK-LABEL: BLOCK B6
+; CHECK: DIVERGENT: %div_a = phi i32 [ %a0, %B2 ], [ %a1, %B5 ]
+; CHECK: DIVERGENT: %div_b = phi i32 [ %b0, %B2 ], [ %b1, %B5 ]
+; CHECK: DIVERGENT: %div_c = phi i32 [ %c0, %B2 ], [ %c1, %B5 ]
+
+
+define amdgpu_kernel void @test_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %div.cond = icmp eq i32 %tid, 0
+ br i1 %div.cond, label %B3, label %B0 ; divergent branch
+
+B0:
+ %a0 = add i32 %a, 1
+ br label %B1
+
+B1:
+ %b0 = add i32 %b, 2
+ br label %B2
+
+B2:
+ %c0 = add i32 %c, 3
+ br label %B6
+
+B3:
+ %a1 = add i32 %a, 10
+ br label %B4
+
+B4:
+ %b1 = add i32 %b, 20
+ br label %B5
+
+B5:
+ %c1 = add i32 %c, 30
+ br label %B6
+
+B6:
+ %div_a = phi i32 [%a0, %B2], [%a1, %B5]
+ %div_b = phi i32 [%b0, %B2], [%b1, %B5]
+ %div_c = phi i32 [%c0, %B2], [%c1, %B5]
+ br i1 %div.cond, label %B8, label %B7 ; divergent branch
+
+B7:
+ %d1 = add i32 %d, 1
+ br label %B8
+
+B8:
+ %div_d = phi i32 [%d1, %B7], [%d, %B6]
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
new file mode 100644
index 0000000000000..54c641862fe79
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_div_loop.ll
@@ -0,0 +1,82 @@
+;
+; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
+;
+; This is to test a divergent phi involving loops
+; (https://github.com/llvm/llvm-project/issues/137277).
+;
+; B0 (div.cond)
+; / \
+; (L)B1 B4
+; | |
+; B2 B5 (L)
+; | |
+; B3 /
+; \ /
+; B6 (phi: divergent)
+;
+
+;
+; CHECK-LABEL: UniformityInfo for function 'test_loop_ctrl_divergence':
+; CHECK-LABEL: BLOCK Entry
+; CHECK: DIVERGENT: %tid = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-LABEL: BLOCK B0
+; CHECK: DIVERGENT: %div.cond = icmp eq i32 %tid, 0
+; CHECK-LABEL: BLOCK B3
+; CHECK: %uni_a = phi i32 [ %a1, %B2 ], [ %a, %Entry ]
+; CHECK-LABEL: BLOCK B5
+; CHECK: %uni.a3 = phi i32 [ %a2, %B4 ], [ %uni_a3, %B5 ]
+; CHECK-LABEL BLOCK B6
+; CHECK: DIVERGENT: %div_a = phi i32 [ %uni_a, %B3 ], [ %uni_a3, %B5 ]
+;
+
+define amdgpu_kernel void @test_loop_ctrl_divergence(i32 %a, i32 %b, i32 %c, i32 %d) {
+Entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %uni.cond0 = icmp eq i32 %d, 0
+ br i1 %uni.cond0, label %B3, label %B0 ; uniform branch
+
+B0:
+ %div.cond = icmp eq i32 %tid, 0
+ br i1 %div.cond, label %B4, label %B1 ; divergent branch
+
+B1:
+ %uni.a0 = phi i32 [%a, %B0], [%a0, %B1]
+ %a0 = add i32 %uni.a0, 1
+ %uni.cond1 = icmp slt i32 %a0, %b
+ br i1 %uni.cond1, label %B1, label %B2
+
+B2:
+ %a1 = add i32 %a0, 10
+ br label %B3
+
+B3:
+ %uni_a = phi i32 [%a1, %B2], [%a, %Entry]
+ br label %B6
+
+B4:
+ %a2 = add i32 %a, 20
+ br label %B5
+
+B5:
+ %uni.a3= phi i32 [%a2, %B4], [%uni_a3, %B5]
+ %uni_a3 = add i32 %uni.a3, 1
+ %uni.cond2 = icmp slt i32 %uni_a3, %c
+ br i1 %uni.cond2, label %B5, label %B6
+
+B6:
+ %div_a = phi i32 [%uni_a, %B3], [%uni_a3, %B5] ; divergent
+ %div.cond2 = icmp eq i32 %tid, 2
+ br i1 %div.cond2, label %B7, label %B8 ; divergent branch
+
+B7:
+ %c0 = add i32 %div_a, 2 ; divergent
+ br label %B8
+
+B8:
+ %ret = phi i32 [%c0, %B7], [0, %B6] ; divergent
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = {nounwind readnone }
|
I agree that #138806 should be discarded. I was able to construct an example which fails with that change. I was about to post that example, but then I saw this PR. But what is the reasoning about the irreducible cycle? There are three cases:
The patch seems to combine case 3 with case 2. Clearly, in case 3 we should stop when I am somewhat concerned that we discovered a glaring bug in the early stopping criterion, and would prefer to be way more cautious when we are revisiting. Note that the IPD is the exact place where the propagation of labels should stop, because that is the last join point of |
I agree that IPD is the last join and the algo should stop at the IPD. I did experiment disabling the early stop, and I got some regressions on irreducible lit tests. I am not sure what is wrong and think maybe irreducible cycles are specially handled and rely on particular propagation. As you think the IPD should be good for irreducible cycles too, I will redo the patch to use dominator analysis. |
@ssahasra Given the following:
In this example, ModifiedPO numbers are in parentheses.
I don't see issues for non-irreducible cycles with |
From the new testcase above, I believe this statement is wrong. E1 is a join that is reachable from A using a path that passes through B (the IPD) and C. So we cannot simply stop at IPD as in #140013 , and we cannot always use it as a floor either. Falling back to first principles, the overall idea is that we have an ordering provided by ModifiedPO, and starting from the divergent branch, we traverse that order to find every join point. If the IPD is not inside any cycle, then we do not have to look beyond the IPD to find all the joins. If the IPD is inside a cycle (doesn't matter if the branch is in the same cycle or not), then there can be a join that is discovered only after encountering the IPD. In this case, the cycle header is guaranteed to be earlier in ModifiedPO by design, and should be used as the floor. We don't have to follow any fresh label that comes after the header. So I think the correct solution is to compute the floor as the minimum of of IPD as one argument and the header of the cycle that contains the IPD as the second argument. But which cycle? Is it the smallest cycle containing the IPD, or the largest cycle? Or the common parent of both the divergent branch and the IPD? |
What I have in mind is something like this:
Then eliminate all the logic for updating FloorIdx, because we already know the minimum, and do the usual traversal that visits all newly labelled blocks such that EDIT: I don't think the problem is limited to irreducible cycles. I have not tried yet, but we could modify the example above with |
Continuing along the previous comment, now I am wondering if we even need the ModifiedPO ordering. As far as I can see, for a node N with an IPDom P, if there is a common cycle C containing N and P, then we need to propagate labels through all the nodes in C until a fixed point is reached, else if there is no C, then propagate labels up to P. In either case, a simple BFS is enough and ModifiedPO gives no advantage. |
This is a little tricky. If the execution favors A->E1->E2 cycles, ie, if there is any active lanes going to E1, the code will execute E1. The execution goes from A to B only if all active lanes at A goes to B. If so, E1 is not a join point. This is more like a lit test under AMDGPU/irreducible/irreducible-2.ll::@natural_loop_two_backedges(), which has the following CFG:
B is DivTermBlock, is ph in H uniform ? If the execution will honor the inner loop first, the phi in H is uniform. If the execution at B favors C (outer loop), then phi in H is divergent. Unless we are sure the inner loop is always favored, we have to treat phi in H as divergent. This also means that IPD isn't necessarily convergence point. If so, IPD should not be used. Is IPD the max-convergence ? This patch (139667) does not assume IPD is a convergence point. It will show E1 is a join if making this example a reducible CFG by removing entry->E2 edge. |
It seems correct. But It seems to me that we may choose to stay the current course without using IPD ? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jgu222 , I see your point now. See comment about checking for outermost irreducible cycle.
Thanks. I will come out a modified patch based on the feedback early next week. |
ad35048
to
111370d
Compare
With the latest patch, all lit tests passed. |
111370d
to
83b0a6b
Compare
The disabled test needs to be re-enabled as commented before submitting. Also, can you please include the tests from this commit: https://github.com/llvm/llvm-project/tree/users/ssahasra/uniformity-more-tests Just go ahead and cherry-pick that into this current PR. |
llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
Outdated
Show resolved
Hide resolved
llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
Outdated
Show resolved
Hide resolved
llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
Outdated
Show resolved
Hide resolved
llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers-nested.ll
Outdated
Show resolved
Hide resolved
a32ee23
to
2381f78
Compare
// Return true if B is inside an irreducible cycle | ||
auto IsInIrreducibleCycle = [this](const BlockT *B) { | ||
for (const auto *Cycle = CI.getCycle(B); Cycle; | ||
Cycle = Cycle->getParentCycle()) { | ||
// If everything is inside a reducible cycle, then look no further | ||
if (Cycle->isReducible() && Cycle->contains(&DivTermBlock)) | ||
return false; | ||
if (!Cycle->isReducible()) | ||
return true; | ||
} | ||
return false; | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Return true if B is inside an irreducible cycle | |
auto IsInIrreducibleCycle = [this](const BlockT *B) { | |
for (const auto *Cycle = CI.getCycle(B); Cycle; | |
Cycle = Cycle->getParentCycle()) { | |
// If everything is inside a reducible cycle, then look no further | |
if (Cycle->isReducible() && Cycle->contains(&DivTermBlock)) | |
return false; | |
if (!Cycle->isReducible()) | |
return true; | |
} | |
return false; | |
}; | |
// Locate the largest ancestor cycle that is not reducible and does not | |
// contain a reducible ancestor. This is done with a lambda that is defined | |
// and invoked in the same statement. | |
const CycleT *IrreducibleAncestor = [](const CycleT *C) -> const CycleT* { | |
if (!C) return nullptr; | |
if (C->isReducible()) return nullptr; | |
while (const CycleT *P = C->getParentCycle()) { | |
if (P->isReducible()) return C; | |
C = P; | |
} | |
assert(!C->getParentCycle()); | |
assert(!C->isReducible()); | |
return C; | |
} (DivTermCycle); |
I think I now understand what we have been missing. If DivTermBlock
is inside a reducible cycle, the traversal does not need to propagate beyond that cycle. If it is not inside any cycle, then the traversal can stop at the IPD. In either case, traversal can stop when there is only one FreshLabel
left. Let R be the largest irreducible cycle containing DivTermBlock
such that any subcycle X that contains DivTermBlock
is also irreducible. Thus, R either has no parent, or the parent is reducible. The traversal needs to visit every block of R. Thus for the stopping condition we have:
if (FreshLabels.count() == 1 && (!IrreducibleAncestor || !IrreducibleAncestor->contains(Block)))
break;
I have pushed this change to the following branch. It also contains all the extra tests that I had mentioned earlier.
https://github.com/llvm/llvm-project/tree/users/ssahasra/uniform_issue_137277
If this logic is convincing, please do cherry-pick this into your PR!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, cherry-picked amd pushed.
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This change fixes this issue by stopping at the post-dominator of the successors of the divergent branch. llvm#137277
Based on feedback, modifying early stop involving loops and minor lit test fix.
2381f78
to
7a45497
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
@ssahasra Could you merge it ? I don't have permission to do it (no merge button for me) |
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This PR, also included changes from ssahasra, fixes this issue by stopping no early than the post-dominator of the divergent branch. #137277 --------- Co-authored-by: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This PR, also included changes from ssahasra, fixes this issue by stopping no early than the post-dominator of the divergent branch. llvm#137277 --------- Co-authored-by: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases. This PR, also included changes from ssahasra, fixes this issue by stopping no early than the post-dominator of the divergent branch. llvm#137277 --------- Co-authored-by: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com>
Control-divergence finds joins by propagating labels from the divergent control branch. The code that checks the early stop for propagation is not correct in some cases.
This PR, also included changes from ssahasra, fixes this issue by stopping no early than the post-dominator of the divergent branch.
#137277