Skip to content

[AMDGPU] Fold multiple aligned v_mov_b32 to v_mov_b64 on gfx942 #138843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 100 additions & 3 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class SIFoldOperandsImpl {
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);
bool tryFoldRegSequence(MachineInstr &MI);
bool tryFoldImmRegSequence(MachineInstr &MI);
bool tryFoldPhiAGPR(MachineInstr &MI);
bool tryFoldLoad(MachineInstr &MI);

Expand Down Expand Up @@ -2194,6 +2195,100 @@ bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
return true;
}

// gfx942+ can use V_MOV_B64 for materializing constant immediates.
// For example:
// %0:vgpr_32 = V_MOV_B32 0, implicit $exec
// %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
// ->
// %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
//
// Returns true (and rewrites/erases MI) if the REG_SEQUENCE was folded.
bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  Register Reg = MI.getOperand(0).getReg();
  const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
  const MCInstrDesc &MovDesc = TII->get(AMDGPU::V_MOV_B64_PSEUDO);
  const TargetRegisterClass *RC =
      TII->getRegClass(MovDesc, 0, TRI, *MI.getMF());

  // Only fold VGPR destinations with a single use, on subtargets that have
  // v_mov_b64, and only when the destination class is (or decomposes into)
  // the 64-bit mov's register class.
  if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg) ||
      (!TRI->getCompatibleSubRegClass(DefRC, RC, AMDGPU::sub0_sub1) &&
       DefRC != RC))
    return false;

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg))
    return false;

  // Only attempting to fold immediate materializations.
  if (any_of(Defs, [](const std::pair<MachineOperand *, unsigned> &Op) {
        return !Op.first->isImm();
      }))
    return false;

  // Pack pairs of adjacent 32-bit subregister immediates into 64-bit values.
  SmallVector<uint64_t, 8> ImmVals;
  uint64_t ImmVal = 0;
  unsigned ImmSize = 0;
  for (auto &[Op, SubIdx] : Defs) {
    unsigned SubRegSize = TRI->getSubRegIdxSize(SubIdx);
    // The packing below assumes each def covers exactly one 32-bit channel.
    // Bail on 16-bit pieces (a pair would alias within the same half) and on
    // 64-bit pieces (would otherwise produce an undefined shift by 64).
    if (SubRegSize != 32)
      return false;

    unsigned Shift = (TRI->getChannelFromSubReg(SubIdx) % 2) * SubRegSize;
    ImmSize += SubRegSize;
    // Mask to 32 bits: getImm() is sign-extended, so a negative immediate
    // would otherwise smear into the opposite half of the packed value.
    ImmVal |= (static_cast<uint64_t>(Op->getImm()) & 0xffffffffULL) << Shift;

    if (ImmSize == 64) {
      // Only 32 bit literals can be encoded.
      if (!isUInt<32>(ImmVal))
        return false;
      ImmVals.push_back(ImmVal);
      ImmVal = 0;
      ImmSize = 0;
    }
  }

  // Bail if the defs did not pack into whole 64-bit chunks (odd number of
  // 32-bit pieces, or no pieces at all) — also guards the ImmVals[0] below.
  if (ImmVals.empty() || ImmSize != 0)
    return false;

  // Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
  if (DefRC == RC) {
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), MovDesc, Reg)
        .addImm(ImmVals[0]);
    MI.eraseFromParent();
    return true;
  }

  // A wider destination that yielded only one 64-bit chunk is malformed for
  // this transform; leave it alone.
  if (ImmVals.size() == 1)
    return false;

  // Can't bail from here on out: modifying the MI.

  // Remove source operands.
  for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
    MI.removeOperand(i);

  // NOTE(review): the chunk-to-subreg assignment below assumes getRegSeqInit
  // returned the pieces in ascending channel order — confirm that guarantee.
  unsigned Ch = 0;
  for (uint64_t Val : ImmVals) {
    Register MovReg = MRI->createVirtualRegister(RC);
    // Duplicate vmov imm materializations (e.g., splatted operands) should get
    // combined by MachineCSE pass.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
            TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
        .addImm(Val);

    // 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
    unsigned SubReg64B =
        SIRegisterInfo::getSubRegFromChannel(/*Channel=*/Ch * 2, /*SubRegs=*/2);

    MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
    MI.addOperand(MachineOperand::CreateImm(SubReg64B));
    ++Ch;
  }

  LLVM_DEBUG(dbgs() << "Folded into " << MI);

  return true;
}

// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
Expand Down Expand Up @@ -2623,9 +2718,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
continue;
}

if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
Changed = true;
continue;
if (MI.isRegSequence()) {
if (tryFoldImmRegSequence(MI) || tryFoldRegSequence(MI)) {
Changed = true;
continue;
}
}

if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4139,8 +4139,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX942-LABEL: store_load_i64_aligned:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, 15
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
Expand Down Expand Up @@ -4250,8 +4249,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX942-LABEL: store_load_i64_unaligned:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, 15
; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
Expand Down Expand Up @@ -5010,10 +5008,8 @@ define amdgpu_ps void @large_offset() {
;
; GFX942-LABEL: large_offset:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
Expand Down
30 changes: 20 additions & 10 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ declare i32 @llvm.amdgcn.workitem.id.x()
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
Expand All @@ -32,8 +34,10 @@ bb:

; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
Expand All @@ -51,8 +55,10 @@ bb:

; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
; GCN-DAG: s_load_dwordx4
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
Expand All @@ -70,8 +76,10 @@ bb:

; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
Expand All @@ -89,8 +97,10 @@ bb:

; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
; GCN-DAG: s_load_dwordx4
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GFX942: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
Expand Down
87 changes: 37 additions & 50 deletions llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB0_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX942-NEXT: .LBB0_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -30,14 +30,13 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX942-NEXT: .LBB1_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -55,14 +54,13 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB2_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX942-NEXT: .LBB2_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -80,20 +78,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX942-NEXT: .LBB3_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -111,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b32_e32 v4, v0
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_mov_b32_e32 v6, v0
; GFX942-NEXT: v_mov_b32_e32 v7, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX942-NEXT: .LBB4_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -142,14 +132,13 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX942-NEXT: .LBB5_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -167,14 +156,13 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX942-NEXT: .LBB6_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -192,14 +180,13 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: v_mov_b32_e32 v3, v0
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB7_2
; GFX942-NEXT: ; %bb.1: ; %cond.load
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX942-NEXT: .LBB7_2:
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_waitcnt vmcnt(0)
Expand Down
Loading
Loading