Skip to content

Commit 0daa120

Browse files
committed
[WIP][DAG] visitFREEZE - always allow freezing multiple operands
Remove the limited freeze multiple-operand handling, always freeze all operands, and rely on later visitFREEZE calls to merge frozen/unfrozen versions of each node to prevent infinite loops. This also removes the special handling of frozen SRA/SRL nodes, as most of the regressions are related. Fixes #149798. Fixes #150204.
1 parent 0c1087b commit 0daa120

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+9495
-8527
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -16918,15 +16918,11 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1691816918
// creating a cycle in a DAG. Let's undo that by mutating the freeze.
1691916919
assert(N->getOperand(0) == FrozenN0 && "Expected cycle in DAG");
1692016920
DAG.UpdateNodeOperands(N, N0);
16921+
// Revisit the node.
16922+
AddToWorklist(N);
1692116923
return FrozenN0;
1692216924
}
1692316925

16924-
// We currently avoid folding freeze over SRA/SRL, due to the problems seen
16925-
// with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
16926-
// example https://reviews.llvm.org/D136529#4120959.
16927-
if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
16928-
return SDValue();
16929-
1693016926
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
1693116927
// Try to push freeze through instructions that propagate but don't produce
1693216928
// poison as far as possible. If an operand of freeze follows three
@@ -16939,19 +16935,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1693916935
N0->getNumValues() != 1 || !N0->hasOneUse())
1694016936
return SDValue();
1694116937

16942-
// TOOD: we should always allow multiple operands, however this increases the
16943-
// likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
16944-
// below causing later nodes that share frozen operands to fold again and no
16945-
// longer being able to confirm other operands are not poison due to recursion
16946-
// depth limits on isGuaranteedNotToBeUndefOrPoison.
16947-
bool AllowMultipleMaybePoisonOperands =
16948-
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
16949-
N0.getOpcode() == ISD::BUILD_VECTOR ||
16950-
N0.getOpcode() == ISD::INSERT_SUBVECTOR ||
16951-
N0.getOpcode() == ISD::BUILD_PAIR ||
16952-
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
16953-
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
16954-
1695516938
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
1695616939
// ones" or "constant" into something that depends on FrozenUndef. We can
1695716940
// instead pick undef values to keep those properties, while at the same time
@@ -16972,74 +16955,13 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1697216955
}
1697316956
}
1697416957

16975-
SmallSet<SDValue, 8> MaybePoisonOperands;
16976-
SmallVector<unsigned, 8> MaybePoisonOperandNumbers;
16977-
for (auto [OpNo, Op] : enumerate(N0->ops())) {
16978-
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly=*/false))
16979-
continue;
16980-
bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
16981-
bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
16982-
if (IsNewMaybePoisonOperand)
16983-
MaybePoisonOperandNumbers.push_back(OpNo);
16984-
if (!HadMaybePoisonOperands)
16985-
continue;
16986-
if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
16987-
// Multiple maybe-poison ops when not allowed - bail out.
16988-
return SDValue();
16989-
}
16990-
}
16991-
// NOTE: the whole op may be not guaranteed to not be undef or poison because
16992-
// it could create undef or poison due to it's poison-generating flags.
16993-
// So not finding any maybe-poison operands is fine.
16994-
16995-
for (unsigned OpNo : MaybePoisonOperandNumbers) {
16996-
// N0 can mutate during iteration, so make sure to refetch the maybe poison
16997-
// operands via the operand numbers. The typical scenario is that we have
16998-
// something like this
16999-
// t262: i32 = freeze t181
17000-
// t150: i32 = ctlz_zero_undef t262
17001-
// t184: i32 = ctlz_zero_undef t181
17002-
// t268: i32 = select_cc t181, Constant:i32<0>, t184, t186, setne:ch
17003-
// When freezing the t181 operand we get t262 back, and then the
17004-
// ReplaceAllUsesOfValueWith call will not only replace t181 by t262, but
17005-
// also recursively replace t184 by t150.
17006-
SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo);
17007-
// Don't replace every single UNDEF everywhere with frozen UNDEF, though.
17008-
if (MaybePoisonOperand.isUndef())
17009-
continue;
17010-
// First, freeze each offending operand.
17011-
SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
17012-
// Then, change all other uses of unfrozen operand to use frozen operand.
17013-
DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
17014-
if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
17015-
FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
17016-
// But, that also updated the use in the freeze we just created, thus
17017-
// creating a cycle in a DAG. Let's undo that by mutating the freeze.
17018-
DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
17019-
MaybePoisonOperand);
17020-
}
17021-
17022-
// This node has been merged with another.
17023-
if (N->getOpcode() == ISD::DELETED_NODE)
17024-
return SDValue(N, 0);
17025-
}
17026-
17027-
assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted!");
17028-
17029-
// The whole node may have been updated, so the value we were holding
17030-
// may no longer be valid. Re-fetch the operand we're `freeze`ing.
17031-
N0 = N->getOperand(0);
16958+
// Collect and freeze all operands.
16959+
SmallVector<SDValue> Ops(N0->ops());
16960+
for (auto &Op : Ops)
16961+
Op = DAG.getFreeze(Op);
1703216962

1703316963
// Finally, recreate the node, it's operands were updated to use
1703416964
// frozen operands, so we just need to use it's "original" operands.
17035-
SmallVector<SDValue> Ops(N0->ops());
17036-
// TODO: ISD::UNDEF and ISD::POISON should get separate handling, but best
17037-
// leave for a future patch.
17038-
for (SDValue &Op : Ops) {
17039-
if (Op.isUndef())
17040-
Op = DAG.getFreeze(Op);
17041-
}
17042-
1704316965
SDLoc DL(N0);
1704416966

1704516967
// Special case handling for ShuffleVectorSDNode nodes.

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
129129
; GCN-NEXT: s_getpc_b64 s[4:5]
130130
; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
131131
; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
132-
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
132+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
133133
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
134134
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
135135
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -211,7 +211,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
211211
; GCN-NEXT: s_getpc_b64 s[4:5]
212212
; GCN-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4
213213
; GCN-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
214-
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
214+
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
215215
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
216216
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
217217
; GCN-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
189189
; GFX-950-NEXT: v_cvt_f32_f64_e32 v7, v[0:1]
190190
; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
191191
; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
192-
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
192+
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
193193
; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
194194
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
195195
; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7
@@ -225,7 +225,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
225225
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
226226
; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
227227
; GFX1250-NEXT: v_cmp_ne_u32_e64 s2, 0, v11
228-
; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
228+
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s1
229229
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
230230
; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
231231
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -437,30 +437,33 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
437437
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
438438
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
439439
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
440-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
441-
; GFX9-O0-NEXT: s_nop 0
442-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
443440
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
444441
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
445-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
442+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
443+
; GFX9-O0-NEXT: s_nop 0
444+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
445+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
446+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
447+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
446448
; GFX9-O0-NEXT: s_nop 0
447-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
449+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
448450
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
449451
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
450-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
452+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
453+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
451454
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
452455
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
453456
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
454457
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
455458
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
456459
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1
457460
; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
458-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1
459-
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
460-
; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15]
461+
; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1
462+
; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5]
461463
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
462464
; GFX9-O0-NEXT: s_mov_b32 s14, s13
463465
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
466+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
464467
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
465468
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
466469
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -474,17 +477,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
474477
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
475478
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
476479
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7]
477-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
478480
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
479-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13]
480-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
481+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
481482
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
482-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
483+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
483484
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
484485
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
485-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
486486
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
487-
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
487+
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
488488
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
489489
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
490490
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -977,10 +977,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
977977
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
978978
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
979979
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
980-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
981-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
982-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
983-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
980+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
981+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
982+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
983+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
984984
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
985985
; GFX9-O0-NEXT: s_mov_b32 s5, s6
986986
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2564,17 +2564,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
25642564
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
25652565
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
25662566
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
2567-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2568-
; GFX9-O0-NEXT: s_nop 0
2569-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
25702567
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
25712568
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
2572-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2569+
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
25732570
; GFX9-O0-NEXT: s_nop 0
2574-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2571+
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
2572+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
2573+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
2574+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2575+
; GFX9-O0-NEXT: s_nop 0
2576+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
25752577
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
25762578
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
2577-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
2579+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
2580+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
25782581
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
25792582
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
25802583
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
@@ -2587,6 +2590,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
25872590
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
25882591
; GFX9-O0-NEXT: s_mov_b32 s14, s13
25892592
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
2593+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
25902594
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
25912595
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
25922596
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3100,10 +3104,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
31003104
; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
31013105
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
31023106
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
3103-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3104-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3105-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3106-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3107+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3108+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3109+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3110+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
31073111
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
31083112
; GFX9-O0-NEXT: s_mov_b32 s5, s6
31093113
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,13 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
1414
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
1515
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
1616
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
17-
; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
18-
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
19-
; GCN-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
20-
; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
21-
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
22-
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
17+
; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
18+
; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]]
19+
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[COPY3]], implicit-def dead $scc
20+
; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
2321
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
24-
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
25-
; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
22+
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
23+
; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
2624
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
2725
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
2826
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec

llvm/test/CodeGen/AMDGPU/fptoi.i128.ll

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,25 +1437,15 @@ define i128 @fptoui_f32_to_i128(float %x) {
14371437
}
14381438

14391439
define i128 @fptosi_f16_to_i128(half %x) {
1440-
; SDAG-LABEL: fptosi_f16_to_i128:
1441-
; SDAG: ; %bb.0:
1442-
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443-
; SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
1444-
; SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
1445-
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1446-
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
1447-
; SDAG-NEXT: v_mov_b32_e32 v3, v2
1448-
; SDAG-NEXT: s_setpc_b64 s[30:31]
1449-
;
1450-
; GISEL-LABEL: fptosi_f16_to_i128:
1451-
; GISEL: ; %bb.0:
1452-
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1453-
; GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
1454-
; GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
1455-
; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1456-
; GISEL-NEXT: v_mov_b32_e32 v2, v1
1457-
; GISEL-NEXT: v_mov_b32_e32 v3, v1
1458-
; GISEL-NEXT: s_setpc_b64 s[30:31]
1440+
; GCN-LABEL: fptosi_f16_to_i128:
1441+
; GCN: ; %bb.0:
1442+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443+
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
1444+
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
1445+
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1446+
; GCN-NEXT: v_mov_b32_e32 v2, v1
1447+
; GCN-NEXT: v_mov_b32_e32 v3, v1
1448+
; GCN-NEXT: s_setpc_b64 s[30:31]
14591449
%cvt = fptosi half %x to i128
14601450
ret i128 %cvt
14611451
}

0 commit comments

Comments
 (0)