Skip to content

Commit 9843843

Browse files
authored
SelectionDAG: Do not propagate divergence through copy glue (#101210)
This fixes DAG divergence mishandling inline asm. This was considering the glue nodes for divergence, when the divergence should only come from the individual CopyFromRegs that are glued. As a result, having any VGPR CopyFromRegs would taint any uniform SGPR copies as divergent, resulting in SGPR copies to VGPR virtual registers later.
1 parent f86ce86 commit 9843843

File tree

5 files changed

+83
-12
lines changed

5 files changed

+83
-12
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 26 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11587,6 +11587,19 @@ class RAUOVWUpdateListener : public SelectionDAG::DAGUpdateListener {
1158711587

1158811588
} // end anonymous namespace
1158911589

11590+
/// Return true if a glue output should propagate divergence information.
11591+
static bool gluePropagatesDivergence(const SDNode *Node) {
11592+
switch (Node->getOpcode()) {
11593+
case ISD::CopyFromReg:
11594+
case ISD::CopyToReg:
11595+
return false;
11596+
default:
11597+
return true;
11598+
}
11599+
11600+
llvm_unreachable("covered opcode switch");
11601+
}
11602+
1159011603
bool SelectionDAG::calculateDivergence(SDNode *N) {
1159111604
if (TLI->isSDNodeAlwaysUniform(N)) {
1159211605
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
@@ -11596,7 +11609,11 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
1159611609
if (TLI->isSDNodeSourceOfDivergence(N, FLI, UA))
1159711610
return true;
1159811611
for (const auto &Op : N->ops()) {
11599-
if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent())
11612+
EVT VT = Op.getValueType();
11613+
11614+
// Skip Chain. It does not carry divergence.
11615+
if (VT != MVT::Other && Op.getNode()->isDivergent() &&
11616+
(VT != MVT::Glue || gluePropagatesDivergence(Op.getNode())))
1160011617
return true;
1160111618
}
1160211619
return false;
@@ -13135,8 +13152,14 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
1313513152
for (unsigned I = 0; I != Vals.size(); ++I) {
1313613153
Ops[I].setUser(Node);
1313713154
Ops[I].setInitial(Vals[I]);
13138-
if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
13139-
IsDivergent |= Ops[I].getNode()->isDivergent();
13155+
EVT VT = Ops[I].getValueType();
13156+
13157+
// Skip Chain. It does not carry divergence.
13158+
if (VT != MVT::Other &&
13159+
(VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
13160+
Ops[I].getNode()->isDivergent()) {
13161+
IsDivergent = true;
13162+
}
1314013163
}
1314113164
Node->NumOperands = Vals.size();
1314213165
Node->OperandList = Ops;

llvm/test/CodeGen/AMDGPU/dag-divergence.ll

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,3 +28,20 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
2828
store i32 %maybe.not.uniform.load, ptr addrspace(1) undef
2929
ret void
3030
}
31+
32+
; This decomposes into a sequence of divergent sub carries. The first
33+
; subs in the sequence are divergent from the value inputs, but the
34+
; last values are divergent due to the carry in glue (such that
35+
; divergence needs to propagate through glue if there are any non-void
36+
; outputs)
37+
; GCN-LABEL: {{^}}wide_carry_divergence_error:
38+
; GCN: v_sub_u32_e32
39+
; GCN: v_subb_u32_e32
40+
; GCN: v_subbrev_u32_e32
41+
; GCN: v_subbrev_u32_e32
42+
define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
43+
%i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
44+
%i1 = sub i128 0, %i
45+
%i2 = insertelement <2 x i128> zeroinitializer, i128 %i1, i64 0
46+
ret <2 x i128> %i2
47+
}

llvm/test/CodeGen/AMDGPU/inline-asm.ll

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -332,3 +332,34 @@ define void @scc_as_i1() {
332332
call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
333333
ret void
334334
}
335+
336+
; Make sure the SGPR def is treated as a uniform value when the inline
337+
; assembly also defines a divergent value. The add should be scalar
338+
; and not introduce illegal vgpr to sgpr copies.
339+
; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
340+
; CHECK: ; def v0 s[4:5]
341+
; CHECK: s_add_u32
342+
; CHECK-NEXT: s_addc_u32
343+
; CHECK: ; use s[4:5]
344+
define void @mixed_def_vgpr_sgpr_def_asm() {
345+
%vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
346+
%vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
347+
%sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
348+
%sgpr.add = add i64 %sgpr, 2
349+
call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
350+
ret void
351+
}
352+
353+
; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
354+
; CHECK: ; def s[4:5] v0
355+
; CHECK: s_add_u32
356+
; CHECK-NEXT: s_addc_u32
357+
; CHECK: ; use s[4:5]
358+
define void @mixed_def_sgpr_vgpr_def_asm() {
359+
%sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
360+
%sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
361+
%vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
362+
%sgpr.add = add i64 %sgpr, 2
363+
call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
364+
ret void
365+
}

llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,15 @@
1313
; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
1414
; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
1515
; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
16-
; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1
16+
; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG t8, Register:f32 $vgpr0, t8:1
1717

1818
; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
1919
; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
2020
; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2
2121
; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
2222
; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4
2323
; GCN-VERBOSE: t8: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
24-
; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:1 t8, Register:f32 $vgpr0 # D:0, t8:1
24+
; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t8, Register:f32 $vgpr0 # D:0, t8:1
2525

2626
define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector) {
2727
entry:

llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
1616
; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
1717
; CHECK-NEXT: t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
1818
; CHECK-NEXT: t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
19-
; CHECK-NEXT: t19: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
19+
; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
2020
; CHECK-EMPTY:
2121
%loc = alloca i64, addrspace(5)
2222
%j = load i64, ptr addrspace(5) %loc
@@ -33,8 +33,8 @@ define i64 @i32_test(i32 %i) nounwind readnone {
3333
; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
3434
; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
3535
; CHECK-NEXT: t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
36-
; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t14, Register:i32 $vgpr1, t22, t14:1
37-
; CHECK-NEXT: t17: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
36+
; CHECK-NEXT: t16: ch,glue = CopyToReg t14, Register:i32 $vgpr1, t22, t14:1
37+
; CHECK-NEXT: t17: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
3838
; CHECK-EMPTY:
3939
%loc = alloca i32, addrspace(5)
4040
%j = load i32, ptr addrspace(5) %loc
@@ -54,8 +54,8 @@ define i64 @i16_test(i16 %i) nounwind readnone {
5454
; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
5555
; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
5656
; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
57-
; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
58-
; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
57+
; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
58+
; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
5959
; CHECK-EMPTY:
6060
%loc = alloca i16, addrspace(5)
6161
%j = load i16, ptr addrspace(5) %loc
@@ -75,8 +75,8 @@ define i64 @i8_test(i8 %i) nounwind readnone {
7575
; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
7676
; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
7777
; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
78-
; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
79-
; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
78+
; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
79+
; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
8080
; CHECK-EMPTY:
8181
%loc = alloca i8, addrspace(5)
8282
%j = load i8, ptr addrspace(5) %loc

0 commit comments

Comments (0)