Skip to content

Commit 4b1910b

Browse files
authored
[GlobalISel][AMDGPU] Import patterns with multiple defs (llvm#84171)
Fixes llvm#63216
1 parent 0ddb122 commit 4b1910b

File tree

9 files changed

+983 -958 lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -728,25 +728,34 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
728728
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
729729
} // End SubtargetPredicate = isGFX9Plus
730730

731-
// FIXME: GlobalISel in general does not handle instructions with 2 results,
732-
// so it cannot use these patterns.
733731
multiclass IMAD32_Pats <VOP3_Pseudo inst> {
734732
def : GCNPat <
735733
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
736-
(EXTRACT_SUBREG (inst $src0, $src1,
734+
(EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
737735
(REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
738736
$src2, sub0,
739737
(i32 (IMPLICIT_DEF)), sub1),
740738
0 /* clamp */),
741739
sub0)
742740
>;
741+
742+
// GISel-specific pattern that avoids creating a SGPR->VGPR copy if
743+
// $src2 is a VGPR.
744+
def : GCNPat <
745+
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, VGPR_32:$src2),
746+
(EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
747+
(REG_SEQUENCE VReg_64,
748+
$src2, sub0,
749+
(i32 (IMPLICIT_DEF)), sub1),
750+
0 /* clamp */),
751+
sub0)
752+
>;
753+
743754
// Immediate src2 in the pattern above will not fold because it would be partially
744755
// undef. Hence define specialized pattern for this case.
745-
// FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
746-
// make it SDAG only.
747756
def : GCNPat <
748-
(ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
749-
(EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
757+
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
758+
(EXTRACT_SUBREG (inst i32:$src0, i32:$src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
750759
>;
751760
}
752761

llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll

Lines changed: 91 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -8,34 +8,35 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
88
; GFX10-LABEL: v_mul_i64_no_zext:
99
; GFX10: ; %bb.0:
1010
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
11-
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
11+
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
1212
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1313
; GFX10-NEXT: s_clause 0x1
14-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
15-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
14+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
15+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
1616
; GFX10-NEXT: s_waitcnt vmcnt(0)
1717
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
18-
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
19-
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
20-
; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1
21-
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[2:3]
18+
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
19+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
20+
; GFX10-NEXT: v_mov_b32_e32 v5, v0
21+
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
2222
; GFX10-NEXT: s_endpgm
2323
;
2424
; GFX11-LABEL: v_mul_i64_no_zext:
2525
; GFX11: ; %bb.0:
2626
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
27-
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
27+
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
2828
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2929
; GFX11-NEXT: s_clause 0x1
30-
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1]
31-
; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3]
30+
; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
31+
; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
3232
; GFX11-NEXT: s_waitcnt vmcnt(0)
3333
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
34-
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v3
35-
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2
34+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
35+
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
36+
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
3637
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
37-
; GFX11-NEXT: v_add3_u32 v5, v5, v0, v1
38-
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3]
38+
; GFX11-NEXT: v_mov_b32_e32 v5, v7
39+
; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
3940
; GFX11-NEXT: s_nop 0
4041
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4142
; GFX11-NEXT: s_endpgm
@@ -64,8 +65,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
6465
; GFX10-NEXT: global_load_dword v4, v3, s[2:3]
6566
; GFX10-NEXT: s_waitcnt vmcnt(0)
6667
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0
67-
; GFX10-NEXT: v_mul_lo_u32 v0, v1, v4
68-
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
68+
; GFX10-NEXT: v_mov_b32_e32 v0, v3
69+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v4, v[0:1]
70+
; GFX10-NEXT: v_mov_b32_e32 v3, v0
6971
; GFX10-NEXT: v_mov_b32_e32 v0, 0
7072
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
7173
; GFX10-NEXT: s_endpgm
@@ -79,12 +81,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
7981
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
8082
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
8183
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
82-
; GFX11-NEXT: global_load_b32 v4, v2, s[0:1]
84+
; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
8385
; GFX11-NEXT: s_waitcnt vmcnt(0)
84-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v4, 0
85-
; GFX11-NEXT: v_mul_lo_u32 v0, v1, v4
86-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
87-
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
86+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
87+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
88+
; GFX11-NEXT: v_mov_b32_e32 v0, v3
89+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
90+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
8891
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
8992
; GFX11-NEXT: s_nop 0
9093
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,8 +117,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
114117
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
115118
; GFX10-NEXT: s_waitcnt vmcnt(0)
116119
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
117-
; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1
118-
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
120+
; GFX10-NEXT: v_mov_b32_e32 v0, v3
121+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
122+
; GFX10-NEXT: v_mov_b32_e32 v3, v0
119123
; GFX10-NEXT: v_mov_b32_e32 v0, 0
120124
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
121125
; GFX10-NEXT: s_endpgm
@@ -128,13 +132,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
128132
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
129133
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
130134
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
131-
; GFX11-NEXT: global_load_b32 v4, v1, s[6:7]
135+
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
132136
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
133137
; GFX11-NEXT: s_waitcnt vmcnt(0)
134-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0
135-
; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1
136-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
137-
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
138+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
139+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
140+
; GFX11-NEXT: v_mov_b32_e32 v0, v3
141+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
142+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
138143
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
139144
; GFX11-NEXT: s_nop 0
140145
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -211,8 +216,9 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
211216
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
212217
; GFX10-NEXT: s_waitcnt vmcnt(0)
213218
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
214-
; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1
215-
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
219+
; GFX10-NEXT: v_mov_b32_e32 v0, v3
220+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
221+
; GFX10-NEXT: v_mov_b32_e32 v3, v0
216222
; GFX10-NEXT: v_mov_b32_e32 v0, 0
217223
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
218224
; GFX10-NEXT: s_endpgm
@@ -225,13 +231,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
225231
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
226232
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
227233
; GFX11-NEXT: s_clause 0x1
228-
; GFX11-NEXT: global_load_b32 v4, v0, s[6:7]
234+
; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
229235
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
230236
; GFX11-NEXT: s_waitcnt vmcnt(0)
231-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0
232-
; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1
233-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
234-
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
237+
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
238+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
239+
; GFX11-NEXT: v_mov_b32_e32 v0, v3
240+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
241+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
235242
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
236243
; GFX11-NEXT: s_nop 0
237244
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -390,15 +397,16 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
390397
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
391398
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
392399
; GFX10-NEXT: s_waitcnt vmcnt(1)
393-
; GFX10-NEXT: v_and_b32_e32 v4, 0xfff00000, v0
394-
; GFX10-NEXT: v_and_b32_e32 v5, 0xf00f, v1
400+
; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
395401
; GFX10-NEXT: s_waitcnt vmcnt(0)
396-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v2, 0
397-
; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3
398-
; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2
399-
; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2
400-
; GFX10-NEXT: v_mov_b32_e32 v2, 0
401-
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
402+
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v6, v2, 0
403+
; GFX10-NEXT: v_mov_b32_e32 v0, v5
404+
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v6, v3, v[0:1]
405+
; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
406+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v2, v[5:6]
407+
; GFX10-NEXT: v_mov_b32_e32 v5, v0
408+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
409+
; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5]
402410
; GFX10-NEXT: s_endpgm
403411
;
404412
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -412,17 +420,18 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
412420
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
413421
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
414422
; GFX11-NEXT: s_waitcnt vmcnt(1)
415-
; GFX11-NEXT: v_and_b32_e32 v4, 0xfff00000, v0
416-
; GFX11-NEXT: v_and_b32_e32 v5, 0xf00f, v1
423+
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
417424
; GFX11-NEXT: s_waitcnt vmcnt(0)
418-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
419-
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
420-
; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
421-
; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
425+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
426+
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
427+
; GFX11-NEXT: v_mov_b32_e32 v0, v5
428+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
429+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
430+
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
431+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
422432
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
423-
; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
424-
; GFX11-NEXT: v_mov_b32_e32 v2, 0
425-
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
433+
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
434+
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5]
426435
; GFX11-NEXT: s_nop 0
427436
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
428437
; GFX11-NEXT: s_endpgm
@@ -491,27 +500,31 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
491500
; GFX10-NEXT: s_clause 0x1
492501
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
493502
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
494-
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
503+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
495504
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
496505
; GFX10-NEXT: s_clause 0x1
497-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
498-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
506+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
507+
; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
508+
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
499509
; GFX10-NEXT: s_waitcnt vmcnt(1)
500510
; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
501-
; GFX10-NEXT: s_waitcnt vmcnt(0)
502-
; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1
503511
; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
504512
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
513+
; GFX10-NEXT: s_cbranch_execz .LBB10_2
505514
; GFX10-NEXT: ; %bb.1: ; %else
506-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s1, v2, v0, 0
507-
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v1
508-
; GFX10-NEXT: v_mov_b32_e32 v0, v2
509-
; GFX10-NEXT: v_mov_b32_e32 v1, v3
510-
; GFX10-NEXT: ; %bb.2: ; %Flow
515+
; GFX10-NEXT: s_waitcnt vmcnt(0)
516+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0
517+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2]
518+
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
519+
; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
520+
; GFX10-NEXT: .LBB10_2: ; %Flow
511521
; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0
522+
; GFX10-NEXT: s_cbranch_execz .LBB10_4
512523
; GFX10-NEXT: ; %bb.3: ; %if
524+
; GFX10-NEXT: s_waitcnt vmcnt(0)
525+
; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5
513526
; GFX10-NEXT: v_mov_b32_e32 v0, 0
514-
; GFX10-NEXT: ; %bb.4: ; %endif
527+
; GFX10-NEXT: .LBB10_4: ; %endif
515528
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
516529
; GFX10-NEXT: v_mov_b32_e32 v2, 0
517530
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -526,22 +539,29 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
526539
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
527540
; GFX11-NEXT: s_clause 0x1
528541
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7]
529-
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
542+
; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1]
530543
; GFX11-NEXT: s_mov_b32 s0, exec_lo
531-
; GFX11-NEXT: s_waitcnt vmcnt(0)
532-
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
544+
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
545+
; GFX11-NEXT: s_waitcnt vmcnt(1)
533546
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
534547
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
548+
; GFX11-NEXT: s_cbranch_execz .LBB10_2
535549
; GFX11-NEXT: ; %bb.1: ; %else
536-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v2, v0, 0
550+
; GFX11-NEXT: s_waitcnt vmcnt(0)
551+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
537552
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
538-
; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
539-
; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
540-
; GFX11-NEXT: ; %bb.2: ; %Flow
553+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
554+
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
555+
; GFX11-NEXT: v_mov_b32_e32 v1, v3
556+
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
557+
; GFX11-NEXT: .LBB10_2: ; %Flow
541558
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
559+
; GFX11-NEXT: s_cbranch_execz .LBB10_4
542560
; GFX11-NEXT: ; %bb.3: ; %if
561+
; GFX11-NEXT: s_waitcnt vmcnt(0)
562+
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
543563
; GFX11-NEXT: v_mov_b32_e32 v0, 0
544-
; GFX11-NEXT: ; %bb.4: ; %endif
564+
; GFX11-NEXT: .LBB10_4: ; %endif
545565
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
546566
; GFX11-NEXT: v_mov_b32_e32 v2, 0
547567
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]

0 commit comments

Comments
 (0)